virtiofsd-1.13.0/.cargo_vcs_info.json0000644000000001360000000000100131270ustar { "git": { "sha1": "3bf77b7cfa42b23935968c757a4f22ed31bc35d6" }, "path_in_vcs": "" }virtiofsd-1.13.0/.gitignore000064400000000000000000000000231046102023000137020ustar 00000000000000/target **/*.rs.bk virtiofsd-1.13.0/50-virtiofsd.json000064400000000000000000000002541046102023000150460ustar 00000000000000{ "description": "virtiofsd vhost-user-fs", "type": "fs", "binary": "/usr/libexec/virtiofsd", "features": [ "migrate-precopy", "separate-options" ] } virtiofsd-1.13.0/CONTRIBUTING.md000064400000000000000000000052721046102023000141560ustar 00000000000000# Contributing to virtiofsd virtiofsd is an open source project licensed under the [Apache v2 License](https://opensource.org/licenses/Apache-2.0) and the [BSD 3 Clause](https://opensource.org/licenses/BSD-3-Clause) license. ## Coding Style We follow the [Rust Style](https://github.com/rust-dev-tools/fmt-rfcs/blob/master/guide/guide.md) convention and enforce it through the Continuous Integration (CI) process calling into `rustfmt` for each submitted Pull Request (PR). ## Certificate of Origin In order to get a clear contribution chain of trust we use the [signed-off-by language](https://01.org/community/signed-process) used by the Linux kernel project. ## Patch format Beside the signed-off-by footer, we expect each patch to comply with the following format: ``` Change summary More detailed explanation of your changes: Why and how. Wrap it to 72 characters. See http://chris.beams.io/posts/git-commit/ for some more good pieces of advice. Signed-off-by: ``` For example: ``` Implement support for optional sandboxing Implement support for setting up a sandbox for running the service. The technique for this has been borrowed from virtiofsd, and consists on switching to new PID, mount and network namespaces, and then switching root to the directory to be shared. 
Future patches will implement additional hardening features like dropping capabilities and seccomp filters. Signed-off-by: Sergio Lopez ``` ## Pull requests virtiofsd uses the “fork-and-merge” development model. Follow these steps if you want to merge your changes to `virtiofsd`: 1. Fork the [virtiofsd](https://gitlab.com/virtio-fs/virtiofsd) project into your GitLab organization. 2. Within your fork, create a branch for your contribution. 3. [Create a merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) against the master branch of the virtiofsd repository. 4. Once the merge request is approved, one of the maintainers will merge it. ## Issue tracking If you have a problem, please let us know. We recommend using [gitlab issues](https://gitlab.com/virtio-fs/virtiofsd/-/issues/new) for formally reporting and documenting them. You can also contact us via email through the [virtio-fs mailing list](https://www.redhat.com/mailman/listinfo/virtio-fs). ## Closing issues You can either close issues manually by adding the fixing commit SHA1 to the issue comments or by adding the `Fixes` keyword to your commit message. After the corresponding MR is merged, GitLab will automatically close that issue when parsing the [commit message](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically). virtiofsd-1.13.0/Cargo.lock0000644000000647170000000000100111210ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
version = 4 [[package]] name = "aho-corasick" version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] [[package]] name = "anstream" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", "is-terminal", "utf8parse", ] [[package]] name = "anstyle" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" [[package]] name = "anstyle-parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" dependencies = [ "anstyle", "windows-sys", ] [[package]] name = "arc-swap" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5d78ce20460b82d3fa150275ed9d55e21064fc7951177baacf86a145c4a4b1f" [[package]] name = "atomic-polyfill" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28" dependencies = [ "critical-section", ] [[package]] name = "atty" version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ "hermit-abi 0.1.19", "libc", "winapi", ] [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "btree-range-map" version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1be5c9672446d3800bcbcaabaeba121fe22f1fb25700c4562b22faf76d377c33" dependencies = [ "btree-slab", "cc-traits", "range-traits", "slab", ] [[package]] name = "btree-slab" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2b56d3029f075c4fa892428a098425b86cef5c89ae54073137ece416aef13c" dependencies = [ "cc-traits", "slab", "smallvec", ] [[package]] name = "byteorder" version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "capng" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6f8e9448233603643e42606121d95f5f8d4e015b3e7619a51593864dd902575" dependencies = [ "bitflags 1.3.2", "libc", ] [[package]] name = "cc" version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" [[package]] name = "cc-traits" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"060303ef31ef4a522737e1b1ab68c67916f2a787bb2f4f54f383279adba962b5" dependencies = [ "slab", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" version = "4.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" dependencies = [ "clap_builder", "clap_derive", "once_cell", ] [[package]] name = "clap_builder" version = "4.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" dependencies = [ "anstream", "anstyle", "clap_lex", "strsim", ] [[package]] name = "clap_derive" version = "4.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" dependencies = [ "heck", "proc-macro2", "quote", "syn 2.0.32", ] [[package]] name = "clap_lex" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" [[package]] name = "cobs" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" [[package]] name = "colorchoice" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] name = "critical-section" version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7059fff8937831a9ae6f0fe4d658ffabf58f2ca96aa9dec1c889f936f705f216" [[package]] name = "env_logger" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" dependencies = [ "atty", "humantime", "log", "regex", "termcolor", ] [[package]] name = "errno" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", "windows-sys", ] [[package]] name = "errno-dragonfly" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" dependencies = [ "cc", "libc", ] [[package]] name = "error-chain" version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" dependencies = [ "version_check", ] [[package]] name = "futures" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" dependencies = [ "futures-channel", "futures-core", "futures-executor", "futures-io", "futures-sink", "futures-task", "futures-util", ] [[package]] name = "futures-channel" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" dependencies = [ "futures-core", "futures-sink", ] [[package]] name = "futures-core" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" [[package]] name = "futures-executor" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" dependencies = [ "futures-core", "futures-task", "futures-util", "num_cpus", ] [[package]] name = "futures-io" version = "0.3.21" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" [[package]] name = "futures-macro" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" dependencies = [ "proc-macro2", "quote", "syn 1.0.98", ] [[package]] name = "futures-sink" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" [[package]] name = "futures-task" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" [[package]] name = "futures-util" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" dependencies = [ "futures-channel", "futures-core", "futures-io", "futures-macro", "futures-sink", "futures-task", "memchr", "pin-project-lite", "pin-utils", "slab", ] [[package]] name = "getrandom" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", "wasi", ] [[package]] name = "hash32" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" dependencies = [ "byteorder", ] [[package]] name = "heapless" version = "0.7.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" dependencies = [ "atomic-polyfill", "hash32", "rustc_version", "serde", "spin", "stable_deref_trait", ] [[package]] name = "heck" version = "0.4.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "hermit-abi" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "hostname" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" dependencies = [ "libc", "match_cfg", "winapi", ] [[package]] name = "humantime" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "is-terminal" version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", "rustix", "windows-sys", ] [[package]] name = "itoa" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" [[package]] name = "libc" version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "libseccomp-sys" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a7cbbd4ad467251987c6e5b47d53b11a5a05add08f2447a9e2d70aef1e0d138" [[package]] name = "linux-raw-sys" version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" [[package]] name = "lock_api" version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ "autocfg", "scopeguard", ] [[package]] name = "log" version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", ] [[package]] name = "match_cfg" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" [[package]] name = "memchr" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "num_cpus" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi 0.1.19", "libc", ] [[package]] name = "num_threads" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" dependencies = [ "libc", ] [[package]] name = "once_cell" version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "pin-project-lite" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "postcard" version = "1.0.6" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "c9ee729232311d3cd113749948b689627618133b1c5012b77342c1950b25eaeb" dependencies = [ "cobs", "heapless", "serde", ] [[package]] name = "ppv-lite86" version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" dependencies = [ "zerocopy", ] [[package]] name = "proc-macro2" version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" dependencies = [ "proc-macro2", ] [[package]] name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] [[package]] name = "range-traits" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d20581732dd76fa913c7dff1a2412b714afe3573e94d41c34719de73337cc8ab" [[package]] name = "regex" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" dependencies = [ "aho-corasick", "memchr", 
"regex-syntax", ] [[package]] name = "regex-syntax" version = "0.6.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] [[package]] name = "rustix" version = "0.38.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399" dependencies = [ "bitflags 2.4.1", "errno", "libc", "linux-raw-sys", "windows-sys", ] [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" [[package]] name = "serde" version = "1.0.168" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d614f89548720367ded108b3c843be93f3a341e22d5674ca0dd5cd57f34926af" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.168" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4fe589678c688e44177da4f27152ee2d190757271dc7f1d5b6b9f68d869d641" dependencies = [ "proc-macro2", "quote", "syn 2.0.32", ] [[package]] name = "slab" version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" dependencies = [ "autocfg", ] [[package]] name = "smallvec" version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] 
name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" dependencies = [ "lock_api", ] [[package]] name = "stable_deref_trait" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syn" version = "2.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syslog" version = "6.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfc7e95b5b795122fafe6519e27629b5ab4232c73ebb2428f568e82b1a457ad3" dependencies = [ "error-chain", "hostname", "libc", "log", "time", ] [[package]] name = "termcolor" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" dependencies = [ "winapi-util", ] [[package]] name = "thiserror" version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c16a64ba9387ef3fdae4f9c1a7f07a0997fce91985c0336f1ddc1822b3b37802" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d14928354b01c4d6a4f0e549069adef399a284e7995c7ccca94e8a07a5346c59" dependencies = [ "proc-macro2", "quote", "syn 2.0.32", ] [[package]] name = "time" version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217" dependencies = [ "itoa", "libc", "num_threads", ] [[package]] name = "unicode-ident" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" [[package]] name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ "getrandom", "rand", "uuid-macro-internal", ] [[package]] name = "uuid-macro-internal" version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b91f57fe13a38d0ce9e28a03463d8d3c2468ed03d75375110ec71d93b449a08" dependencies = [ "proc-macro2", "quote", "syn 2.0.32", ] [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "vhost" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bce0aad4d8776cb64f1ac591e908a561c50ba6adac4416296efee590b155623f" dependencies = [ "bitflags 2.4.1", "libc", "uuid", "vm-memory", "vmm-sys-util", ] [[package]] name = "vhost-user-backend" version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daa03d476437d005abd2dec0970c468ed2a692e6a0604b834699680e171de942" dependencies = [ "libc", "log", "vhost", "virtio-bindings", 
"virtio-queue", "vm-memory", "vmm-sys-util", ] [[package]] name = "virtio-bindings" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1711e61c00f8cb450bd15368152a1e37a12ef195008ddc7d0f4812f9e2b30a68" [[package]] name = "virtio-queue" version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "872e2f3fbd70a7e6f01689720cce3d5c2c5efe52b484dd07b674246ada0e9a8d" dependencies = [ "log", "virtio-bindings", "vm-memory", "vmm-sys-util", ] [[package]] name = "virtiofsd" version = "1.13.0" dependencies = [ "bitflags 1.3.2", "btree-range-map", "capng", "clap", "env_logger", "futures", "libc", "libseccomp-sys", "log", "postcard", "serde", "syslog", "vhost", "vhost-user-backend", "virtio-bindings", "virtio-queue", "vm-memory", "vmm-sys-util", ] [[package]] name = "vm-memory" version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2919f87420b6998a131eb7c78843890295e91a3f8f786ccc925c8d387b75121" dependencies = [ "arc-swap", "bitflags 2.4.1", "libc", "thiserror", "vmm-sys-util", "winapi", ] [[package]] name = "vmm-sys-util" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d1435039746e20da4f8d507a72ee1b916f7b4b05af7a91c093d2c6561934ede" dependencies = [ "bitflags 1.3.2", "libc", ] [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.48.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" version = "0.48.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "zerocopy" version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", "syn 2.0.32", ] virtiofsd-1.13.0/Cargo.toml0000644000000041510000000000100111260ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2018" name = "virtiofsd" version = "1.13.0" authors = ["The Virtiofs Project Developers"] build = false exclude = [".gitlab-ci.yml"] autobins = false autoexamples = false autotests = false autobenches = false description = "A virtio-fs vhost-user device daemon" homepage = "https://virtio-fs.gitlab.io/" readme = "README.md" license = "Apache-2.0 AND BSD-3-Clause" repository = "https://gitlab.com/virtio-fs/virtiofsd" [profile.release] lto = true [lib] name = "virtiofsd" path = "src/lib.rs" [[bin]] name = "virtiofsd" path = "src/main.rs" required-features = ["seccomp"] [dependencies.bitflags] version = "1.2" [dependencies.btree-range-map] version = "0.7" [dependencies.capng] version = "0.2.2" [dependencies.clap] version = "4" features = ["derive"] [dependencies.env_logger] version = "0.8.4" [dependencies.futures] version = "0.3" features = ["thread-pool"] [dependencies.libc] version = "0.2.155" [dependencies.libseccomp-sys] version = "0.2" optional = true [dependencies.log] version = "0.4" [dependencies.postcard] version = "1.0" features = ["use-std"] [dependencies.serde] version = "1.0" features = ["derive"] [dependencies.syslog] version = "6.1" [dependencies.vhost] version = "0.13.0" [dependencies.vhost-user-backend] version = "0.17.0" [dependencies.virtio-bindings] version = "0.2.1" [dependencies.virtio-queue] version = "0.14.0" [dependencies.vm-memory] version = "0.16.0" features = [ "backend-mmap", "backend-atomic", ] [dependencies.vmm-sys-util] version = "0.12.1" [features] default = ["seccomp"] seccomp = ["dep:libseccomp-sys"] xen = [ "vhost-user-backend/xen", "vhost/xen", "vm-memory/xen", ] virtiofsd-1.13.0/Cargo.toml.orig000064400000000000000000000023761046102023000146160ustar 00000000000000[package] name = "virtiofsd" description = "A virtio-fs vhost-user device daemon" version = "1.13.0" authors = ["The Virtiofs Project Developers"] edition = "2018" homepage = "https://virtio-fs.gitlab.io/" repository = 
"https://gitlab.com/virtio-fs/virtiofsd" license = "Apache-2.0 AND BSD-3-Clause" readme = "README.md" exclude = [".gitlab-ci.yml"] [features] default = ["seccomp"] # Expose seccomp bindings from the library. seccomp = ["dep:libseccomp-sys"] # Enabling Xen support will _disable_ QEMU/KVM support! xen = ["vhost-user-backend/xen", "vhost/xen", "vm-memory/xen"] [[bin]] name = "virtiofsd" required-features = ["seccomp"] [dependencies] bitflags = "1.2" btree-range-map = "0.7" capng = "0.2.2" env_logger = "0.8.4" futures = { version = "0.3", features = ["thread-pool"] } libc = "0.2.155" log = "0.4" libseccomp-sys = { version = "0.2", optional = true } clap = { version = "4", features = ["derive"] } postcard = { version = "1.0", features = ["use-std"] } serde = { version = "1.0", features = ["derive"] } vhost-user-backend = "0.17.0" vhost = "0.13.0" virtio-bindings = "0.2.1" vm-memory = { version = "0.16.0", features = ["backend-mmap", "backend-atomic"] } virtio-queue = "0.14.0" vmm-sys-util = "0.12.1" syslog = "6.1" [profile.release] lto = true virtiofsd-1.13.0/LICENSE-APACHE000064400000000000000000000261361046102023000136530ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. virtiofsd-1.13.0/LICENSE-BSD-3-Clause000064400000000000000000000030321046102023000147020ustar 00000000000000// Copyright 2017 The Chromium OS Authors. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. virtiofsd-1.13.0/README.md000064400000000000000000000641451046102023000132100ustar 00000000000000# virtiofsd A [virtio-fs](https://virtio-fs.gitlab.io/) vhost-user device daemon written in Rust. ## Building from sources ### Requirements This project depends on [libcap-ng](https://people.redhat.com/sgrubb/libcap-ng/) and [libseccomp](https://github.com/seccomp/libseccomp). You can obtain those dependencies by building them for their respective sources, or by installing the correspondent development packages from your distribution, if available: - Fedora/CentOS/RHEL ```shell dnf install libcap-ng-devel libseccomp-devel ``` - Debian/Ubuntu ```shell apt install libcap-ng-dev libseccomp-dev ``` ### Compiling virtiofsd is written in Rust, so you will have to install [Rust](https://www.rust-lang.org/learn/get-started) in order to compile it, and it uses [cargo](https://doc.rust-lang.org/cargo/) to manage the project and its dependencies. After installing Rust, you can compile it to a binary by running: ```shell cargo build --release ``` ## CI-built binaries Every time new code is merged, the CI pipeline will upload a debug binary of virtiofsd. It is intended to be an accessible way for anyone to download and test virtiofsd without needing a Rust toolchain installed. The debug binary is built only for x86\_64 Linux-based systems. 
[Click here to download the latest build]( https://gitlab.com/virtio-fs/virtiofsd/-/jobs/artifacts/main/download?job=publish) ## Contributing See [CONTRIBUTING.md](CONTRIBUTING.md) ## Usage This program must be run as the root user or as a "fake" root inside a user namespace (see [Running as non-privileged user](#running-as-non-privileged-user)). The program drops privileges where possible during startup, although it must be able to create and access files with any uid/gid: * The ability to invoke syscalls is limited using `seccomp(2)`. * Linux `capabilities(7)` are dropped. virtiofsd only retains the following capabilities: `CAP_CHOWN`, `CAP_DAC_OVERRIDE`, `CAP_FOWNER`, `CAP_FSETID`, `CAP_SETGID`, `CAP_SETUID`, `CAP_MKNOD`, `CAP_SETFCAP` (and `CAP_DAC_READ_SEARCH` if `--inode-file-handles` is used). ```shell virtiofsd [FLAGS] [OPTIONS] --fd <fd>|--socket-path <path> --shared-dir <dir> ``` #### Flags ```shell -h, --help ``` Prints help information. ```shell -V, --version ``` Prints version information. ```shell --syslog ``` Log to syslog. Default: stderr. ```shell --print-capabilities ``` Print vhost-user.json backend program capabilities and exit. ```shell --allow-direct-io ``` Honor the `O_DIRECT` flag passed down by guest applications. ```shell --announce-submounts ``` Tell the guest which directories are mount points. If multiple filesystems are mounted in the shared directory, virtiofsd passes inode IDs directly to the guest, and because such IDs are unique only on a single filesystem, it is possible that the guest will encounter duplicates if multiple filesystems are mounted in the shared directory. `--announce-submounts` solves that problem because it reports a different device number for every submount it encounters. In addition, when running with `--announce-submounts`, the client sends one `SYNCFS` request per submount that is to be synced, so virtiofsd will call `syncfs()` on each submount. 
On the other hand, when running without `--announce-submounts`, the client only sends a `SYNCFS` request for the root mount, which may lead to data loss/corruption. ```shell --no-killpriv-v2 ``` Disable `KILLPRIV V2` support. This is required if the shared directory is an NFS file system. `KILLPRIV V2` support is disabled by default. ```shell --killpriv-v2 ``` Enable `KILLPRIV V2` support. It is disabled by default. ```shell --no-readdirplus ``` Disable support for `READDIRPLUS` operations. ```shell --writeback ``` Enable writeback cache. ```shell --xattr ``` Enable support for extended attributes. ```shell --posix-acl ``` Enable support for posix ACLs (implies --xattr). ```shell --security-label ``` Enable support for security label (SELinux). ```shell --preserve-noatime ``` Always preserve `O_NOATIME`. By default virtiofsd will implicitly clean up `O_NOATIME` to prevent potential permission errors. The option `--preserve-noatime` can be used to override this behavior and preserve the `O_NOATIME` flag specified by the client. ```shell --readonly ``` Prevent write accesses from the guest. Note that this does not make the underlying shared directory an actual read-only mount, so e.g. the access time is still updated on accesses. #### Options ```shell --shared-dir <dir> ``` Shared directory path. ```shell --tag <tag> ``` The tag that the virtio device advertises. Setting this option will enable advertising of VHOST_USER_PROTOCOL_F_CONFIG. However, the vhost-user frontend of your hypervisor may not negotiate this feature and (or) ignore this value. Notably, QEMU currently (as of 8.1) ignores the CONFIG feature. QEMU versions from 7.1 to 8.0 will crash while attempting to log a warning about not supporting the feature. ```shell --socket-group <name> ``` Name of group for the vhost-user socket. ```shell --socket-path <path> ``` vhost-user socket path. ```shell --fd <fd> ``` File descriptor for the listening (not yet connected) socket. 
```shell --log-level ``` Log level (error, warn, info, debug, trace, off). Default: info. ```shell --thread-pool-size ``` Maximum thread pool size. A value of "0" disables the pool. Default: 0. ```shell --rlimit-nofile ``` Set maximum number of file descriptors. If the soft limit is greater than 1M or `--rlimit-nofile=0` is passed as parameter, the maximum number of file descriptors is not changed. Default: min(1000000, `/proc/sys/fs/nr_open`). ```shell --modcaps= ``` Modify the list of capabilities, e.g., `--modcaps=+sys_admin:-chown`. Although it is not mandatory, it is recommended to always use the `=` sign, in other case, this will fail `--modcaps -mknod`, because it will be interpreted as two options, instead of the intended `--modcaps=-mknod`. ```shell --sandbox ``` Sandbox mechanism to isolate the daemon process (namespace, chroot, none). - **namespace**: The program switches into a new file system namespace (`namespaces(7)`) and invokes `pivot_root(2)` to make the shared directory tree its root. A new mount (`mount_namespaces(7)`), pid (`pid_namespaces(7)`) and net namespace (`network_namespaces(7)`) is also created to isolate the process. - **chroot**: The program invokes `chroot(2)` to make the shared directory tree its root. This mode is intended for container environments where the container runtime has already set up the namespaces and the program does not have permission to create namespaces itself. - **none**: Do not isolate the daemon (not recommended). Both **namespace** and **chroot** sandbox modes prevent "file system escapes" due to symlinks and other file system objects that might lead to files outside the shared directory. Default: namespace. ```shell --seccomp ``` Action to take when seccomp finds a not allowed syscall (none, kill, log, trap). Default: kill. ```shell --cache ``` The caching policy the file system should use (auto, always, metadata, never). Default: auto. 
```shell --allow-mmap ``` For shared directories with `--cache={metadata, never}`, allow files contained in the shared directory to be `mmap`'d. Regardless of the selected cache policy, this option should only be enabled when the file system has exclusive access to the directory. ```shell --inode-file-handles= ``` When to use file handles to reference inodes instead of `O_PATH` file descriptors (never, prefer, mandatory). - **never**: Never use file handles, always use `O_PATH` file descriptors. - **prefer**: Attempt to generate file handles, but fall back to `O_PATH` file descriptors where the underlying filesystem does not support file handles or `CAP_DAC_READ_SEARCH` is not available. Useful when there are various different filesystems under the shared directory and some of them do not support file handles. - **mandatory**: Always use file handles. It will fail if the underlying filesystem does not support file handles or `CAP_DAC_READ_SEARCH` is not available. Using file handles reduces the number of file descriptors virtiofsd keeps open, which is not only helpful with resources, but may also be important in cases where virtiofsd should only have file descriptors open for files that are open in the guest, e.g. to get around bad interactions with NFS's silly renaming (see [NFS FAQ, Section D2: "What is a "silly rename"?"](http://nfs.sourceforge.net/)). Default: never. ```shell --xattrmap ``` Add custom rules for translating extended attributes between host and guest (e.g., `:map::user.virtiofs.:`). For additional details please see [Extended attribute mapping](doc/xattr-mapping.md). ```shell --uid-map=:namespace_uid:host_uid:count: ``` When running virtiofsd as non-root, map a range of UIDs from host to namespace. In order to use this option, the range of subordinate user IDs must have been set up via `subuid(5)`. virtiofsd uses `newuidmap(1)` for non-trivial cases, that requires a valid subuid, to do the mapping. 
If this option is not provided, virtiofsd will set up a 1-to-1 mapping for current uid. namespace_uid: Beginning of the range of UIDs inside the user namespace. host_uid: Beginning of the range of UIDs outside the user namespace. count: Length of the ranges (both inside and outside the user namespace). For instance, let's assume the invoking UID is 1000 and the content of /etc/subuid is: 1000:100000:65536, which creates 65536 subuids starting at 100000, i.e. the (inclusive) range [100000, 165535], belonging to the actual UID 1000. This range can be mapped to the UIDs [0, 65535] in virtiofsd’s user namespace (i.e. as seen in the guest) via --uid-map=:0:100000:65536:. Alternatively, you can simply map your own UID to a single UID in the namespace: For example, --uid-map=:0:1000:1: would map UID 1000 to root’s UID in the namespace (and thus the guest). ```shell --gid-map=:namespace_gid:host_gid:count: ``` When running virtiofsd as non-root, map a range of GIDs from host to namespace. In order to use this option, the range of subordinate group IDs must have been set up via `subgid(5)`. virtiofsd uses `newgidmap(1)` for non-trivial cases, that requires a valid subgid, to do the mapping. If this option is not provided, virtiofsd will set up a 1-to-1 mapping for current gid. namespace_gid: Beginning of the range of GIDs inside the user namespace. host_gid: Beginning of the range of GIDs outside the user namespace. count: Length of the ranges (both inside and outside the user namespace). For instance, let's assume the invoking GID is 1000 and the content of /etc/subgid is: 1000:100000:65536, which creates 65536 subgids starting at 100000, i.e. the (inclusive) range [100000, 165535], belonging to the actual GID 1000. This range can be mapped to the GIDs [0, 65535] in virtiofsd’s user namespace (i.e. as seen in the guest) via --gid-map=:0:100000:65536:. 
Alternatively, you can simply map your own GID to a single GID in the namespace: For example, --gid-map=:0:1000:1: would map GID 1000 to root’s GID in the namespace (and thus the guest). ```shell --translate-uid=guest:<guest base UID>:<host base UID>:<count> --translate-uid=host:<host base UID>:<guest base UID>:<count> --translate-uid=squash-guest:<guest base UID>:<host UID>:<count> --translate-uid=squash-host:<host base UID>:<guest UID>:<count> --translate-uid=forbid-guest:<guest base UID>:<count> --translate-uid=map:<guest base UID>:<host base UID>:<count> ``` Set up a map for virtiofsd to internally translate between host and guest UIDs. As opposed to `--uid-map`, this option does not require a user namespace, and may freely be used regardless of whether virtiofsd runs as root or not. Mapping from guest UIDs to host UIDs is independent from the reverse, i.e. setting up a *guest* or *squash-guest* mapping only instructs virtiofsd to follow this guest-to-host mapping; it does not imply any potentially corresponding host-to-guest mapping. The only exception is the prefix-less form, which sets up a bidirectional mapping. - `guest:<guest base UID>:<host base UID>:<count>`: Maps the range [guest base UID, guest base UID + count) 1:1 to [host base UID, host base UID + count), i.e. `guest UID ↦ host base UID + (guest UID - guest base UID)`. - `host:<host base UID>:<guest base UID>:<count>`: Reverse of the above, i.e. maps the range [host base UID, host base UID + count) to [guest base UID, guest base UID + count); `host UID ↦ guest base UID + (host UID - host base UID)`. - `squash-guest:<guest base UID>:<host UID>:<count>`: Maps everything in the range [guest base UID, guest base UID + count) to the single given host UID, i.e. `guest UID ↦ host UID`. - `squash-host:<host base UID>:<guest UID>:<count>`: Reverse of the above, i.e. maps the range [host base UID, host base UID + count) to the single given guest UID, i.e. `host UID ↦ guest UID`. - `forbid-guest:<guest base UID>:<count>`: Prohibits use of guest UIDs in the given range: Returns an error to the guest whenever it tries to use a UID in that range for a new file or assign such a UID to an existing file. - `map:<guest base UID>:<host base UID>:<count>`: Sets up a bidirectional 1:1 mapping between [guest base UID, guest base UID + count) and [host base UID, host base UID + count), i.e. 
the same as passing both `guest:::` and `host:::`. When giving multiple mappings, their source ranges must not overlap. Neither of `--translate-uid` and `--translate-gid` can be used together with `--posix-acl`; translating UIDs or GIDs in virtiofsd would break posix ACLs. Example use case: virtiofsd runs unprivileged with UID:GID 1001:100. It cannot change its own UID/GID, so attempting to let the guest create files with any other UID/GID combination will fail. By using `--translate-uid` and `--translate-gid`, however, a mapping from guest UIDs/GIDs can be set up such that virtiofsd will create files under the only combination that it can, which is 1001:100. For example, to allow any guest user to create a file, we can squash everything to 1001:100, which will create all those files as 1001:100 on the host. In the guest, we may want to have those files appear as 1000:1000, though, and all other UIDs and GIDs should be visible unchanged in the guest. That would look like so: ```shell virtiofsd [...] \ --translate-uid squash-guest:0:1001:4294967295 \ --translate-gid squash-guest:0:100:4294967295 \ --translate-uid host:1001:1000:1 \ --translate-gid host:100:1000:1 ``` ```shell --translate-gid=::: ``` Same as `--translate-uid`, but for GIDs. ```shell --migration-mode= ``` Defines how to perform migration, i.e. how to represent the internal state to the destination instance, and how to obtain that representation. Note that (when using QEMU) **QEMU version 8.2** or newer is required to use virtio-fs migration. See **doc/migration.md** for a comprehensive explanation on how virtio-fs migration works, what its limitations are, and what configurations we recommend. virtiofsd internally holds references to all inodes indexed or opened by the guest. 
During migration, these references need to be transferred to the destination; how that is done is determined with this switch: - **find-paths**: For all inodes held by the source instance, look up their paths by reading the symlinks in /proc/self/fd, transfer those paths to the destination, and let the destination instance open those paths. If any inode cannot be located this way, we fall back to iterating through the shared directory (exhaustive search) to find those paths. This allows migration without requiring special privileges, and regardless of whether source and destination use the same shared directory; but is vulnerable to third parties changing metadata in the shared directory while migration is ongoing (e.g. renaming, unlinking, removing permissions), which can potentially lead to data loss and/or corruption. In addition, the fall-back method of iterating through the shared directory is expensive in terms of I/O. - **file-handles**: Has the source instance generate a file handle for each inode, which is sent to the destination and opened there. A file handle is data that uniquely identifies an inode on a filesystem. Consequently, this migration mode requires source and destination to use the same shared directory on the same filesystem; however, source and destination instance need not necessarily be on the same host, if that filesystem is a network filesystem. If the shared directory spans multiple filesystems, they must all be the same in source and destination and have the same mount points inside of the shared directory. Using file handles is comparatively cheap in terms of I/O, and it is resilient against inodes being renamed or unlinked by any party while they are still in use by the guest, as long as the virtiofsd source instance keeps running until migration is fully complete. 
They do however require the destination instance to have the *DAC_READ_SEARCH* capability, which basically means having to run it as root, and to pass the `--modcaps=+dac_read_search` command line option to it so it does not drop that capability at start-up. This parameter is ignored on the destination side of migration. ```shell --migration-on-error= ``` Controls how to respond to errors during migration. During migration, some inodes that the guest has indexed or opened may turn out not to be migrateable: Either the source instance cannot construct instructions on how the destination instance may be able to find/open some inode, or the destination instance finds itself unable to follow those instructions. In all cases, the destination instance is notified of these inodes, and then decides what to do depending on the value of this parameter: - **abort**: Whenever the destination instance sees any such error, it returns a hard error to the vhost-user front-end (e.g. QEMU), which aborts migration. Execution is to continue on the source VM. - **guest-error**: Migration is allowed to finish, but all affected inodes are marked as invalid. The guest will not be able to access any such inode, receiving only errors. Note that this parameter is to be used purely for the destination instance; its value is ignored on the source side of migration. ```shell --migration-verify-handles ``` Ensure that the migration destination opens the very same inodes as the source. This only works if source and destination are to use the same shared directory on the same filesystem. On migration, the source instance informs the destination instance of all inodes the guest has indexed or opened, and has the destination re-open them. This switch makes the source generate a file handle for each such inode, and send it to the destination, allowing the destination to re-generate the same file handle for the inode it has opened and verify that it is equal, proving it is the same inode. 
(File handles are per-filesystem unique identifiers for inodes that, besides the inode ID, also include a generation ID to protect against inode ID reuse.) Using this option protects against external parties renaming or replacing inodes while migration is ongoing, which, without this option, can lead to data loss or corruption, so it should always be used when other processes besides virtiofsd have write access to the shared directory. However, again, it only works if both source and destination use the same shared directory; though in the case of network filesystems, this does not require them to run on the same host. This parameter is ignored on the destination side of migration. ```shell --migration-confirm-paths ``` Double-check the identity of inodes right before switching over to the destination, potentially making migration more resilient when third parties have write access to the shared directory. When representing migrated inodes by their paths relative to the shared directory, double-check during switch-over to the destination that each path still matches the respective inode. If a path does not match, try to correct by consulting the respective symbolic link in */proc/self/fd*. Note that this option requires accessing each inode indexed or opened by the guest once during the switch-over stage of migration, when both the source and destination VM are paused, so can prolong that phase for an indeterminate amount of time. This parameter is ignored on the destination side of migration. 
### Examples Export `/mnt` on vhost-user UNIX domain socket `/tmp/vfsd.sock`: ```shell host# virtiofsd --socket-path=/tmp/vfsd.sock --shared-dir /mnt \ --announce-submounts --inode-file-handles=mandatory & host# qemu-system \ -blockdev file,node-name=hdd,filename= \ -device virtio-blk,drive=hdd \ -chardev socket,id=char0,path=/tmp/vfsd.sock \ -device vhost-user-fs-pci,queue-size=1024,chardev=char0,tag=myfs \ -object memory-backend-memfd,id=mem,size=4G,share=on \ -numa node,memdev=mem \ -accel kvm -m 4G guest# mount -t virtiofs myfs /mnt ``` See [FAQ](#faq) for adding virtiofs config to an existing qemu command-line. ### Running as non-privileged user When run without root, virtiofsd requires a user namespace (see `user_namespaces(7)`) to be able to switch between arbitrary user/group IDs within the guest. virtiofsd will fail in a user namespace where UIDs/GIDs have not been mapped (i.e., `uid_map` and `gid_map` files have not been written). There are many options to run virtiofsd inside a user namespace. For instance: Let's assume the invoking UID and GID is 1000 and the content of both `/etc/subuid` and `/etc/subgid` are: ``` 1000:100000:65536 ``` Using `podman-unshare(1)` the user namespace will be configured so that the invoking user's UID and primary GID (i.e., 1000) appear to be UID 0 and GID 0, respectively. 
Any ranges which match that user and group in `/etc/subuid` and `/etc/subgid` are also mapped in as themselves with the help of the `newuidmap(1)` and `newgidmap(1)` helpers: ```shell host$ podman unshare -- virtiofsd --socket-path=/tmp/vfsd.sock --shared-dir /mnt \ --announce-submounts --sandbox chroot & ``` Using `lxc-usernsexec(1)`, we could leave the invoking user outside the mapping, having the root user inside the user namespace mapped to the user and group 100000: ```shell host$ lxc-usernsexec -m b:0:100000:65536 -- virtiofsd --socket-path=/tmp/vfsd.sock \ --shared-dir /mnt --announce-submounts --sandbox chroot & ``` In order to have the same behavior as `podman-unshare(1)`, we need to run ```shell host$ lxc-usernsexec -m b:0:1000:1 -m b:1:100000:65536 -- virtiofsd --socket-path=/tmp/vfsd.sock \ --shared-dir /mnt --announce-submounts --sandbox chroot & ``` We could also select `--sandbox none` instead of `--sandbox chroot`. #### Limitations - Within the guest, it is not possible to create block or char device nodes in the shared directory. - virtiofsd can't use file handles (`--inode-file-handles` requires `CAP_DAC_READ_SEARCH`), so a large number of file descriptors is required. Additionally, on NFS, not using file handles may result in a hidden file lingering after some file is deleted (see [NFS FAQ, Section D2: "What is a "silly rename"?"](http://nfs.sourceforge.net/)). - virtiofsd will not be able to increase `RLIMIT_NOFILE`. ## FAQ - How to read-only-share a directory that cannot be modified within the guest? You can either use virtiofsd’s `--readonly` switch to prevent write accesses from the guest, for instance, exporting `share` ```shell virtiofsd --shared-dir share --readonly ... ``` Or export a read-only mount point: ```shell mkdir ro-share mount -o bind,ro share ro-share virtiofsd --shared-dir ro-share ... ``` - How to share multiple directories with the same virtiofsd? 
Currently, virtiofsd only supports sharing a single directory, but it is possible to use submounts to achieve this, for instance, exporting `share0`, `share1`: ```shell mkdir -p share/{sh0,sh1} mount -o bind share0 share/sh0 mount -o bind share1 share/sh1 virtiofsd --announce-submounts --shared-dir share ... ``` Note the use of `--announce-submounts` to prevent data loss/corruption. - How to add virtiofs devices to an existing qemu command-line: If `-object memory-backend-memfd,id=mem` and either `-numa node,memdev=mem` or a `memory-backend=mem` property in the `-machine` option have not already been added to the command, add them. If a different memory backend is already configured then it should be changed to `memory-backend-memfd`. `-object memory-backend-memfd` **must** have the option `share=on` and `size=` **must** match the memory size defined by `-m`. For each virtiofs device mount add a `-chardev socket,id=${MATCHING_ID},path=${VIRTIOFSD_SOCKET_PATH}` and `-device vhost-user-fs-pci,queue-size=1024,chardev=${MATCHING_ID},tag=${VIRTIOFS_TAG}` substituting appropriate values for the shell-style variables. ## SELinux Support One can enable support for SELinux by running virtiofsd with option "--security-label". But this will try to save guest's security context in xattr security.selinux on host and it might fail if host's SELinux policy does not permit virtiofsd to do this operation. Hence, it is recommended to remap guest's "security.selinux" xattr to say "trusted.virtiofs.security.selinux" on host. Add following option to command line. "--xattrmap=:map:security.selinux:trusted.virtiofs.:" This will make sure that guest and host's SELinux xattrs on same file remain separate and not interfere with each other. And will allow both host and guest to implement their own separate SELinux policies. Setting trusted xattr on host requires CAP_SYS_ADMIN. So one will need add this capability to daemon. Add following option to command line. 
"--modcaps=+sys_admin" trusted xattrs are not namespaced. So virtiofsd needs to have CAP_SYS_ADMIN in init_user_ns. IOW, one should not be using user namespaces and virtiofsd should run with CAP_SYS_ADMIN. Giving CAP_SYS_ADMIN increases the risk to the system. Now virtiofsd is more powerful, and if it gets compromised, it can do a lot of damage to the host system. So keep this trade-off in mind while making a decision. virtiofsd-1.13.0/doc/migration.md000064400000000000000000000326361046102023000150070ustar 00000000000000Migration with virtio-fs ======================== virtiofsd supports migration through [vhost-user’s device state interface](https://qemu-project.gitlab.io/qemu/interop/vhost-user.html#migrating-back-end-state), allowing it to place internal state into the vhost-user front-end’s (e.g. QEMU’s) migration stream. This allows it to transfer information about files and directories the guest has open to the destination instance. It is very important to note however that virtiofsd never migrates any data, i.e. source and destination are expected to export shared directories with matching contents (e.g. by using the same directory on the same filesystem). If you do not care about any of the details, feel free to skip ahead to the [section explaining recommended configurations](#recommended-configurations). Filesystem State Requirements ----------------------------- As just mentioned, virtiofsd does not migrate any filesystem data, and provides no facilities to do so. The user is responsible for ensuring that the shared directories used by the source and destination instances of virtiofsd have the same content. Specifically, they must have the same content during switch-over, once execution is stopped on the source, until it is resumed on the destination. One way to achieve this is to use the same directory on the same filesystem for both instances, e.g. by using a shared network filesystem.
If that is not possible, the contents of the shared directory must be copied (outside of QEMU) from the source to the destination during the switch-over phase. This may be reasonably feasible for a read-only use case, where copying can take place long in advance of the actual migration. Snapshots --------- Because virtiofsd embeds its state into the front-end’s migration stream, it is possible to store this stream somewhere to restore it later, i.e. in a snapshot. From a technical perspective, this is perfectly fine, but it must again be stressed that virtiofsd’s state includes absolutely no data; therefore, some mechanism outside of virtio-fs/virtiofsd must be used to ensure that when restoring such a snapshot, the shared directory is in exactly the same state as it was when the snapshot was taken. What Needs to Be Migrated Anyway? --------------------------------- For every file or directory that is open in the guest, virtiofsd has a corresponding file descriptor (FD) open in the shared directory. The destination instance must restore these FDs, so the source instance must provide instructions on how to do so. The same applies to files and directories the guest does not really have open, but still has their directory entries cached; through FUSE, the guest kernel can reference all such cached entries by associated integer IDs. Therefore, virtiofsd needs to have an internal map that can convert each ID into something that strongly references its associated filesystem object; specifically, either an `O_PATH` FD or a file handle, depending on the `--inode-file-handles` setting. These too need to be transferred in some manner to the destination. Migration Modes --------------- There are two general ways virtiofsd’s internal state can be serialized and migrated, [by path](#by-path---migration-modefind-paths) or [as file handles](#as-file-handles---migration-modefile-handles). 
### By Path (`--migration-mode=find-paths`) For every filesystem object that must be transferred to the destination, virtiofsd tries to find its path inside of the shared directory, and transmits that to the destination, which then opens it. Because paths can change, this mode can be quite brittle. virtiofsd begins collecting paths once migration starts (long before the switch-over phase), so any changes to those paths afterwards can lead to various problems, especially if those changes are done by third parties outside of the VM guest. Some examples for such changes are: #### Unlinking Files can exist without paths, specifically when they’re opened but unlinked. Consequently, such files (that may be open in the guest) cannot be migrated using paths. When migrating anyway, the file contents will be lost once the source instance is quit. Note that for files for which virtiofsd cannot find a path, migration will produce an error. The error response behavior is controlled via the destination instance’s `--migration-on-error` switch; `abort` will abort migration (on the destination) when any error occurs, allowing execution to be resumed on the source side, with any FD still open. `guest-error` will continue migration, marking any file that could not be migrated as faulty, returning errors for any guest accesses. #### Renaming / Moving When files or directories are renamed or moved by the migrating guest, virtiofsd is naturally aware of this, and so can update the paths it holds internally. This is not the case when paths are changed outside of virtiofsd, by third parties. In this case, virtiofsd will remain unaware and will send the outdated path to the destination, which will not be able to resolve it (error behavior is then controlled by the `--migration-on-error` switch, as described in the [Unlinking](#unlinking) section). 
In contrast to the *unlinking* case, it would at least theoretically be possible to migrate these files using their new paths, if virtiofsd somehow could get notified of the rename/move. The `--migration-confirm-paths` option has it double-check each collected path at switch-over time, and so may be able to detect such moves and renames in many cases (but does so on the source side, so still has a non-empty TOCTTOU window). #### Replacing In the *renaming / moving* case, the worst thing that can happen is that a file the guest has open is no longer accessible after migration. A much worse case is when a file is replaced without it being noticed: In this case, the destination will open the other file, but present it as the old one to the guest, with no error indication at all. That can lead to data corruption. The migration destination cannot detect this case without performing specific checks, because opening the path it has received from the source will succeed (but yield the wrong file). Such checks are: * `--migration-verify-handles`: With this switch, source and destination generate a file handle for each transferred path. A file handle is a piece of data that uniquely identifies a filesystem object (like a file or directory), and becomes invalid (“stale”) when that object is deleted; so we can use it to verify a file’s identity between source and destination. However, it only works when source and destination use the same shared directory on the same filesystem (e.g. a network filesystem). Furthermore, any mismatches that are detected cannot be recovered from (i.e. we still don’t know the involved files’ true paths, so `--migration-on-error` will decide how to proceed). * `--migration-confirm-paths`: This switch makes the source instance double-check all paths during switch-over, i.e. when both the source and destination instance are stopped. 
While this can theoretically allow error recovery (by fetching an updated path from */proc/self/fd*), and does not require source and destination to use the same filesystem, it still leaves a small TOCTTOU window open (between checking and the destination instance opening the paths), and it requires doing potentially quite a bit of I/O (checking paths) during migration downtime, which is generally not desirable. Both switches can also be used together, but they can only be used in *find-paths* migration mode, not *file-handles* (because they simply are not necessary in *file-handles* mode). Check the [dedicated section for more information on recommended configurations](#recommended-configurations). #### Implementation Detail: Collecting Paths There are two ways paths can be collected, either by [looking up FDs in */proc/self/fd*](#querying-procselffd), or by [recursing through the shared directory](#recursing-through-shared-directory). virtiofsd implements both of these, but only uses the latter as a fall-back for when the former fails. ##### Querying /proc/self/fd */proc/self/fd* contains a symbolic link for each file descriptor opened by the current process. These aren’t really symbolic links, though: Opening them does not resolve their link target, but directly opens (basically duplicates) the corresponding file descriptor. Still, these links can have valid targets: The kernel tries to keep track internally what paths the underlying filesystem objects have, and provides this information there. Querying this is thus a much faster way to get a path for our file descriptors than to recurse through the shared directory. The downside is that there is no formal guarantee that this works. It is unclear under what circumstances this can break down; if it does, virtiofsd will fall back to [recursing through the shared directory](#recursing-through-shared-directory). 
For what it’s worth, the only case we have seen where a file has a valid path, but */proc/self/fd* cannot provide it, is to use its file handle to open the file, when it has not yet been opened through its path. For example: 1. Open file using path 2. Generate and store file handle 3. Unmount file system, then mount it again 4. Open stored file handle Something like this can happen with virtiofsd only on the migration destination instance after a *file-handles* migration; in other cases, virtiofsd will generally open files by path first, giving the kernel a chance to make a note of that path. ##### Recursing Through Shared Directory We can also obtain files’ paths by recursing through the shared directory, enumerating all paths therein, and associating them with the respective files and directories. Naturally, this is quite slow, especially the more files there are in the shared directory, which is why virtiofsd will only fall back to this implementation if it fails to query a path from */proc/self/fd*. ### As File Handles (`--migration-mode=file-handles`) Every filesystem object that must be transferred to the destination is converted to a file handle (a piece of data that uniquely identifies this object on a given filesystem, and can be used to open it), which is sent to the destination. Because there is a unique and permanent relationship between such an object and its file handle, this migration mode is not susceptible to the problems “by path” migration has, for example, a file handle even stays valid when a file has a link count of 0 (i.e. is deleted, has no path anymore) but some process still has it open (i.e. holds an FD). However, because file handles are just some data that allows access to everything on a filesystem without checking e.g. access rights along a file’s path, opening them requires the *DAC_READ_SEARCH* capability, which grants the ability to read any file, regardless of its access mode. 
Generally, this capability is only available to applications running as root. Furthermore, because file handles are specific to a given filesystem instance, when using them for virtio-fs migration, the source and destination instance must use the same shared directory on the same filesystem, e.g. a shared network filesystem. Recommended Configurations -------------------------- ### General Consider which **`--migration-on-error`** mode suits your needs: * `abort`: When any error is encountered (e.g. destination cannot find a file that is open in the guest), abort migration altogether. You can then generally resume execution on the source; the source virtiofsd instance will retain all open file descriptors until it is quit. * `guest-error`: When encountering errors pertaining to a specific file or directory, do not abort migration, but instead mark that file or directory as invalid. Any guest accesses to it will then result in guest-visible errors. ### Shared Filesystems When source and destination instance use the same shared directory on the same filesystem, using **`--migration-mode=file-handles`** is recommended. This requires the destination instance to have the `DAC_READ_SEARCH` capability. If that capability cannot be provided, we recommend using **`--migration-mode=find-paths`** together with **`--migration-verify-handles`**. Using **`--migration-confirm-paths`** additionally is optional; it can better recover from unexpected path changes than `verify-handles` alone, but will prolong migration downtime. ### Different Filesystem If the source and destination shared directory are not the exact same directory on the same filesystem, users must ensure their contents are equal at migration switch-over. For example, read-only configuration directories presented to the guest via virtio-fs can just be copied over to the destination ahead of migration. For such cases, use **`--migration-mode=find-paths`**. 
We also recommend the filesystem to be read-only, which can be reinforced with virtiofsd’s **`--readonly`** switch. If that is not possible, *take special care* to ensure source and destination directory contents match at the switch-over point in time! If, during migration, it is possible for the shared directory contents to be modified by a party other than the migrating virtiofsd instance, we strongly recommend using **`--migration-confirm-paths`**. Still, that is not a 100 % safe solution. So above all, for the case where source and destination instance do not use the same shared directory on the same (shared) filesystem, we strongly advise not to allow the shared directory to be modified at all during migration. virtiofsd-1.13.0/doc/xattr-mapping.md000064400000000000000000000172331046102023000156070ustar 00000000000000# Extended attribute (xattr) mapping By default, the name of xattrs used by the client are passed through to the server file system. This can be a problem where either those xattr names are used by something on the server (e.g. selinux client/server confusion) or if the virtiofsd is running in a container with restricted privileges where it cannot access some attributes. ## Mapping syntax A mapping of xattr names can be made using `--xattrmap=` where the `` string consists of a series of rules. When looking for a mapping, the first matching rule applies. There *must* be a mapping for every xattr name in the list of rules, for example by making the final rule a catch-all rule to match any remaining attributes. Each rule consists of a number of fields separated with a separator that is the first non-white space character in the rule. This separator must then be used for the whole rule. White space may be added before and after each rule. 
Using `:` as the separator a rule is of the form: ``` :type:scope:key:prepend: ``` **scope** is one of: - `client`: Match **key** against an xattr name from the client for setxattr/getxattr/removexattr - `server`: Match **prepend** against an xattr name from the server for listxattr - `all`: Can be used to make a single rule where both the server and client matches are triggered. **type** is one of: - `prefix`: Is designed to prepend and strip a prefix; the modified attributes then being passed on to the client/server. - `ok`: Causes the rule set to be terminated when a match is found while allowing matching xattrs through unchanged. It is intended both as a way of explicitly terminating the list of rules, and to allow some xattrs to skip following rules. - `bad`: If a client tries to use a name matching **key** it's denied using `EPERM`; when the server passes an attribute name matching **prepend** it's hidden. In many ways its use is very like the `ok` type as either an explicit terminator or for special handling of certain patterns. - `unsupported`: If a client tries to use a name matching **key** it's denied using `ENOTSUP`; when the server passes an attribute name matching **prepend** it's hidden. In many ways its use is very like the `ok` type as either an explicit terminator or for special handling of certain patterns. **key** is a string tested as a prefix on an attribute name originating on the client. It may be empty in which case a `client` scoped rule will always match on client names. **prepend** is a string tested as a prefix on an attribute name originating on the server, and used as a new prefix. It may be empty in which case a `server` scoped rule will always match on all names from the server. 
e.g.: | Mapping rule | Description | | ----------------------------------------- | ---------------------------------------------------------------------------------------------------- | | `:prefix:client:trusted.:user.virtiofs.:` | will match `trusted.*` attributes in client calls and prefix them before passing them to the server. | | `:prefix:server::user.virtiofs.:` | will strip `user.virtiofs.` from all server replies. | | `:prefix:all:trusted.:user.virtiofs.:` | combines the previous two cases into a single rule. | | `:ok:client:user.::` | will allow get/set xattr for `user.` xattrs. | | `:ok:server::security.:` | will pass `security.` xattrs in listxattr from the server. | | `:ok:all:::` | will terminate the rule search passing any remaining attributes in both directions. | | `:bad:server::security.:` | would hide `security.` xattrs in listxattr from the server. | A simpler **map** type provides a shorter syntax for the common case: ``` :map:key:prepend: ``` The `map` type adds a number of separate rules to add **prepend** as a prefix to the matched **key** (or all attributes if **key** is empty). There may be at most one `map` rule, and it must be the last rule in the set. Please note that when the `security.capability` xattr is remapped, the daemon has to do extra work to remove it during many operations, which the host kernel normally does itself. ## Security considerations Operating systems typically partition the xattr namespace using well-defined name prefixes. Each partition may have different access controls applied. For example, on Linux there are multiple partitions - `system.*`: access varies depending on attribute and filesystem - `security.*`: only processes with `CAP_SYS_ADMIN` - `trusted.*`: only processes with `CAP_SYS_ADMIN` - `user.*`: any process granted by file permissions / ownership While other OS such as FreeBSD have different name prefixes and access control rules. 
When remapping attributes on the host, it is important to ensure that the remapping does not allow a guest user to evade the guest access control rules. Consider if `trusted.*` from the guest was remapped to `user.virtiofs.trusted.*` in the host. An unprivileged user in a Linux guest has the ability to write to xattrs under `user.*`. Thus the user can evade the access control restriction on `trusted.*` by instead writing to `user.virtiofs.trusted.*`. As noted above, the partitions used and access controls applied, will vary across guest OS, so it is not wise to try to predict what the guest OS will use. The simplest way to avoid an insecure configuration is to remap all xattrs at once, to a given fixed prefix. This is shown in example (1) below. If selectively mapping only a subset of xattr prefixes, then rules must be added to explicitly block direct access to the target of the remapping. This is shown in example (2) below. ## Mapping examples 1. Prefix all attributes with `user.virtiofs.` ```shell --xattrmap=":prefix:all::user.virtiofs.::bad:all:::" ``` This uses two rules, using : as the field separator; the first rule prefixes and strips `user.virtiofs.`, the second rule hides any non-prefixed attributes that the host set. This is equivalent to the `map` rule: ```shell --xattrmap=":map::user.virtiofs.:" ``` 2. Prefix `trusted.` attributes, allow others through ```shell --xattrmap="/prefix/all/trusted./user.virtiofs./ /bad/server//trusted./ /bad/client/user.virtiofs.// /ok/all///" ``` (each rule is on a single line just for the sake of clarity) Here there are four rules, using `/` as the field separator, and also demonstrating that new lines can be included between rules. The first rule is the prefixing of `trusted.` and stripping of `user.virtiofs.`. The second rule hides unprefixed `trusted.` attributes on the host. 
The third rule stops a guest from explicitly setting the `user.virtiofs.` path directly to prevent access control bypass on the target of the earlier prefix remapping. Finally, the fourth rule lets all remaining attributes through. This is equivalent to the `map` rule: ```shell --xattrmap="/map/trusted./user.virtiofs./" ``` 3. Hide `security.` attributes, and allow everything else ```shell --xattrmap="/bad/all/security./security./ /ok/all///" ``` The first rule combines what could be separate client and server rules into a single `all` rule, matching `security.` in either client arguments or lists returned from the host. This prevents the client from seeing and/or setting any `security.` attributes on the server.virtiofsd-1.13.0/rustfmt.toml000064400000000000000000000000601046102023000143140ustar 00000000000000imports_granularity = "Module" edition = "2018" virtiofsd-1.13.0/src/descriptor_utils.rs000064400000000000000000001005751046102023000164620ustar 00000000000000// Copyright 2019 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
use std::collections::VecDeque; use std::fmt::{self, Display}; use std::io::{self, Read, Write}; use std::mem::{size_of, MaybeUninit}; use std::ops::Deref; use std::ptr::copy_nonoverlapping; use std::{cmp, result}; use vhost_user_backend::bitmap::BitmapMmapRegion; use virtio_queue::DescriptorChain; use vm_memory::bitmap::{Bitmap, BitmapSlice}; use vm_memory::{ Address, ByteValued, GuestMemory, GuestMemoryError, GuestMemoryMmap, GuestMemoryRegion, VolatileMemory, VolatileMemoryError, VolatileSlice, }; use crate::file_traits::FileReadWriteAtVolatile; use crate::oslib; #[derive(Debug)] pub enum Error { DescriptorChainOverflow, FindMemoryRegion, GuestMemoryError(GuestMemoryError), InvalidChain, IoError(io::Error), SplitOutOfBounds(usize), VolatileMemoryError(VolatileMemoryError), } impl Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::Error::*; match self { DescriptorChainOverflow => write!( f, "the combined length of all the buffers in a `DescriptorChain` would overflow" ), FindMemoryRegion => write!(f, "no memory region for this address range"), GuestMemoryError(e) => write!(f, "descriptor guest memory error: {e}"), InvalidChain => write!(f, "invalid descriptor chain"), IoError(e) => write!(f, "descriptor I/O error: {e}"), SplitOutOfBounds(off) => write!(f, "`DescriptorChain` split is out of bounds: {off}"), VolatileMemoryError(e) => write!(f, "volatile memory error: {e}"), } } } pub type Result = result::Result; impl std::error::Error for Error {} #[derive(Clone)] struct DescriptorChainConsumer<'a, B> { buffers: VecDeque>, bytes_consumed: usize, } impl<'a, B: BitmapSlice> DescriptorChainConsumer<'a, B> { fn available_bytes(&self) -> usize { // This is guaranteed not to overflow because the total length of the chain // is checked during all creations of `DescriptorChainConsumer` (see // `Reader::new()` and `Writer::new()`). 
self.buffers .iter() .fold(0usize, |count, vs| count + vs.len()) } fn bytes_consumed(&self) -> usize { self.bytes_consumed } /// Consumes at most `count` bytes from the `DescriptorChain`. Callers must provide a function /// that takes a `&[VolatileSlice]` and returns the total number of bytes consumed. This /// function guarantees that the combined length of all the slices in the `&[VolatileSlice]` is /// less than or equal to `count`. /// /// # Errors /// /// If the provided function returns any error then no bytes are consumed from the buffer and /// the error is returned to the caller. fn consume(&mut self, count: usize, f: F) -> io::Result where F: FnOnce(&[&VolatileSlice]) -> io::Result, { let mut buflen = 0; let mut bufs = Vec::with_capacity(self.buffers.len()); for vs in &self.buffers { if buflen >= count { break; } bufs.push(vs); let rem = count - buflen; if rem < vs.len() { buflen += rem; } else { buflen += vs.len(); } } if bufs.is_empty() { return Ok(0); } let bytes_consumed = f(&bufs)?; // This can happen if a driver tricks a device into reading/writing more data than // fits in a `usize`. let total_bytes_consumed = self.bytes_consumed .checked_add(bytes_consumed) .ok_or_else(|| { io::Error::new(io::ErrorKind::InvalidData, Error::DescriptorChainOverflow) })?; let mut rem = bytes_consumed; while let Some(vs) = self.buffers.pop_front() { if rem < vs.len() { // Split the slice and push the remainder back into the buffer list. Safe because we // know that `rem` is not out of bounds due to the check and we checked the bounds // on `vs` when we added it to the buffer list. self.buffers.push_front(vs.offset(rem).unwrap()); break; } // No need for checked math because we know that `vs.size() <= rem`. 
rem -= vs.len(); } self.bytes_consumed = total_bytes_consumed; Ok(bytes_consumed) } fn split_at(&mut self, offset: usize) -> Result> { let mut rem = offset; let pos = self.buffers.iter().position(|vs| { if rem < vs.len() { true } else { rem -= vs.len(); false } }); if let Some(at) = pos { let mut other = self.buffers.split_off(at); if rem > 0 { // There must be at least one element in `other` because we checked // its `size` value in the call to `position` above. let front = other.pop_front().expect("empty VecDeque after split"); self.buffers .push_back(front.subslice(0, rem).map_err(Error::VolatileMemoryError)?); other.push_front(front.offset(rem).map_err(Error::VolatileMemoryError)?); } Ok(DescriptorChainConsumer { buffers: other, bytes_consumed: 0, }) } else if rem == 0 { Ok(DescriptorChainConsumer { buffers: VecDeque::new(), bytes_consumed: 0, }) } else { Err(Error::SplitOutOfBounds(offset)) } } } /// Provides high-level interface over the sequence of memory regions /// defined by readable descriptors in the descriptor chain. /// /// Note that virtio spec requires driver to place any device-writable /// descriptors after any device-readable descriptors (2.6.4.2 in Virtio Spec v1.1). /// Reader will skip iterating over descriptor chain when first writable /// descriptor is encountered. #[derive(Clone)] pub struct Reader<'a, B = BitmapMmapRegion> { buffer: DescriptorChainConsumer<'a, B>, } impl<'a, B: Bitmap + BitmapSlice + 'static> Reader<'a, B> { /// Construct a new Reader wrapper over `desc_chain`. pub fn new( mem: &'a GuestMemoryMmap, desc_chain: DescriptorChain, ) -> Result> where M: Deref, M::Target: GuestMemory + Sized, { let mut total_len: usize = 0; let buffers = desc_chain .readable() .map(|desc| { // Verify that summing the descriptor sizes does not overflow. // This can happen if a driver tricks a device into reading more data than // fits in a `usize`. 
total_len = total_len .checked_add(desc.len() as usize) .ok_or(Error::DescriptorChainOverflow)?; let region = mem .find_region(desc.addr()) .ok_or(Error::FindMemoryRegion)?; let offset = desc .addr() .checked_sub(region.start_addr().raw_value()) .unwrap(); region .deref() .get_slice(offset.raw_value() as usize, desc.len() as usize) .map_err(Error::VolatileMemoryError) }) .collect::>>>()?; Ok(Reader { buffer: DescriptorChainConsumer { buffers, bytes_consumed: 0, }, }) } /// Reads an object from the descriptor chain buffer. pub fn read_obj(&mut self) -> io::Result { let mut obj = MaybeUninit::::uninit(); // Safe because `MaybeUninit` guarantees that the pointer is valid for // `size_of::()` bytes. let buf = unsafe { ::std::slice::from_raw_parts_mut(obj.as_mut_ptr() as *mut u8, size_of::()) }; self.read_exact(buf)?; // Safe because any type that implements `ByteValued` can be considered initialized // even if it is filled with random data. Ok(unsafe { obj.assume_init() }) } /// Write data from the descriptor chain buffer into a File at offset `off`. /// /// Return the number of bytes written. This can be less than `count` if there isn't enough /// data in the descriptor chain buffer, or if the file’s write function performed a short /// write. pub fn write_to_file_at>( &mut self, dst: F, count: usize, off: u64, flags: Option, ) -> io::Result { self.buffer.consume(count, |bufs| { dst.write_vectored_at_volatile(bufs, off, flags) }) } /// Returns number of bytes available for reading. May return an error if the combined /// lengths of all the buffers in the DescriptorChain would cause an integer overflow. pub fn available_bytes(&self) -> usize { self.buffer.available_bytes() } /// Returns number of bytes already read from the descriptor chain buffer. pub fn bytes_read(&self) -> usize { self.buffer.bytes_consumed() } /// Splits this `Reader` into two at the given offset in the `DescriptorChain` buffer. 
/// After the split, `self` will be able to read up to `offset` bytes while the returned /// `Reader` can read up to `available_bytes() - offset` bytes. Returns an error if /// `offset > self.available_bytes()`. pub fn split_at(&mut self, offset: usize) -> Result> { self.buffer.split_at(offset).map(|buffer| Reader { buffer }) } } impl io::Read for Reader<'_, B> { fn read(&mut self, buf: &mut [u8]) -> io::Result { self.buffer.consume(buf.len(), |bufs| { let mut rem = buf; let mut total = 0; for vs in bufs { let copy_len = cmp::min(rem.len(), vs.len()); // SAFETY: Safe because we verify that we do not read outside // of the slice's bound. The slice guard will only get dropped // after the function returns. This will keep the pointer valid // while reads are happening. unsafe { copy_nonoverlapping(vs.ptr_guard().as_ptr(), rem.as_mut_ptr(), copy_len); } rem = &mut rem[copy_len..]; total += copy_len; } Ok(total) }) } } /// Provides high-level interface over the sequence of memory regions /// defined by writable descriptors in the descriptor chain. /// /// Note that virtio spec requires driver to place any device-writable /// descriptors after any device-readable descriptors (2.6.4.2 in Virtio Spec v1.1). /// Writer will start iterating the descriptors from the first writable one and will /// assume that all following descriptors are writable. #[derive(Clone)] pub struct Writer<'a, B = BitmapMmapRegion> { buffer: DescriptorChainConsumer<'a, B>, } impl<'a, B: Bitmap + BitmapSlice + 'static> Writer<'a, B> { /// Construct a new Writer wrapper over `desc_chain`. pub fn new( mem: &'a GuestMemoryMmap, desc_chain: DescriptorChain, ) -> Result> where M: Deref, M::Target: GuestMemory + Sized, { let mut total_len: usize = 0; let buffers = desc_chain .writable() .map(|desc| { // Verify that summing the descriptor sizes does not overflow. // This can happen if a driver tricks a device into writing more data than // fits in a `usize`. 
total_len = total_len .checked_add(desc.len() as usize) .ok_or(Error::DescriptorChainOverflow)?; let region = mem .find_region(desc.addr()) .ok_or(Error::FindMemoryRegion)?; let offset = desc .addr() .checked_sub(region.start_addr().raw_value()) .unwrap(); region .deref() .get_slice(offset.raw_value() as usize, desc.len() as usize) .map_err(Error::VolatileMemoryError) }) .collect::>>>()?; Ok(Writer { buffer: DescriptorChainConsumer { buffers, bytes_consumed: 0, }, }) } /// Writes an object to the descriptor chain buffer. pub fn write_obj(&mut self, val: T) -> io::Result<()> { self.write_all(val.as_slice()) } /// Returns number of bytes available for writing. May return an error if the combined /// lengths of all the buffers in the DescriptorChain would cause an overflow. pub fn available_bytes(&self) -> usize { self.buffer.available_bytes() } /// Read data into the descriptor chain buffer from a File at offset `off`. /// /// Return the number of bytes read. This can be less than `count` if there isn't enough data /// in the descriptor chain buffer, or if the file’s read function performed a short read. pub fn read_from_file_at>( &mut self, src: F, count: usize, off: u64, ) -> io::Result { self.buffer .consume(count, |bufs| src.read_vectored_at_volatile(bufs, off)) } /// Returns number of bytes already written to the descriptor chain buffer. pub fn bytes_written(&self) -> usize { self.buffer.bytes_consumed() } /// Splits this `Writer` into two at the given offset in the `DescriptorChain` buffer. /// After the split, `self` will be able to write up to `offset` bytes while the returned /// `Writer` can write up to `available_bytes() - offset` bytes. Returns an error if /// `offset > self.available_bytes()`. 
pub fn split_at(&mut self, offset: usize) -> Result> { self.buffer.split_at(offset).map(|buffer| Writer { buffer }) } } impl io::Write for Writer<'_, B> { fn write(&mut self, buf: &[u8]) -> io::Result { self.buffer.consume(buf.len(), |bufs| { let mut rem = buf; let mut total = 0; for vs in bufs { let copy_len = cmp::min(rem.len(), vs.len()); // SAFETY: Safe because we ensure that we do not write over the // slice's bounds. The slice guard will only get dropped after // the function returns. This will keep the pointer valid while // writes are happening. unsafe { copy_nonoverlapping(rem.as_ptr(), vs.ptr_guard_mut().as_ptr(), copy_len); } vs.bitmap().mark_dirty(0, copy_len); rem = &rem[copy_len..]; total += copy_len; } Ok(total) }) } fn flush(&mut self) -> io::Result<()> { // Nothing to flush since the writes go straight into the buffer. Ok(()) } } #[derive(Copy, Clone, PartialEq, Eq)] pub enum DescriptorType { Readable, Writable, } #[cfg(test)] mod tests { use super::*; use virtio_queue::{Queue, QueueOwnedT, QueueT}; use vm_memory::{Bytes, GuestAddress, Le16, Le32, Le64}; const VIRTQ_DESC_F_NEXT: u16 = 0x1; const VIRTQ_DESC_F_WRITE: u16 = 0x2; const MAX_QUEUE_SIZE: u16 = 32768; #[derive(Copy, Clone, Debug, Default)] #[repr(C)] struct virtq_desc { addr: Le64, len: Le32, flags: Le16, next: Le16, } // Safe because it only has data and has no implicit padding. unsafe impl ByteValued for virtq_desc {} #[derive(Copy, Clone, Debug, Default)] #[repr(C)] struct virtq_avail { flags: Le16, idx: Le16, ring: Le16, } // Safe because it only has data and has no implicit padding. unsafe impl ByteValued for virtq_avail {} /// Test utility function to create a descriptor chain in guest memory. 
pub fn create_descriptor_chain( memory: &GuestMemoryMmap, descriptor_array_addr: GuestAddress, mut buffers_start_addr: GuestAddress, descriptors: Vec<(DescriptorType, u32)>, spaces_between_regions: u32, ) -> Result> { let descriptors_len = descriptors.len(); for (index, (type_, size)) in descriptors.into_iter().enumerate() { let mut flags = 0; if let DescriptorType::Writable = type_ { flags |= VIRTQ_DESC_F_WRITE; } if index + 1 < descriptors_len { flags |= VIRTQ_DESC_F_NEXT; } let index = index as u16; let desc = virtq_desc { addr: buffers_start_addr.raw_value().into(), len: size.into(), flags: flags.into(), next: (index + 1).into(), }; let offset = size + spaces_between_regions; buffers_start_addr = buffers_start_addr .checked_add(u64::from(offset)) .ok_or(Error::InvalidChain)?; let _ = memory.write_obj( desc, descriptor_array_addr .checked_add(u64::from(index) * std::mem::size_of::() as u64) .ok_or(Error::InvalidChain)?, ); } let avail_ring = descriptor_array_addr .checked_add( u64::from(descriptors_len as u16) * std::mem::size_of::() as u64, ) .ok_or(Error::InvalidChain)?; let avail = virtq_avail { flags: 0.into(), idx: 1.into(), ring: 0.into(), }; let _ = memory.write_obj(avail, avail_ring); let mut queue: Queue = Queue::new(MAX_QUEUE_SIZE).unwrap(); queue .try_set_desc_table_address(descriptor_array_addr) .unwrap(); queue.try_set_avail_ring_address(avail_ring).unwrap(); queue.set_ready(true); let desc = queue.iter(memory).unwrap().next().unwrap(); Ok(desc.clone()) } #[test] fn reader_test_simple_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 8), (Readable, 16), (Readable, 18), (Readable, 64), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); 
assert_eq!(reader.available_bytes(), 106); assert_eq!(reader.bytes_read(), 0); let mut buffer = [0 as u8; 64]; if let Err(e) = reader.read_exact(&mut buffer) { panic!("read_exact should not fail here: {:?}", e); } assert_eq!(reader.available_bytes(), 42); assert_eq!(reader.bytes_read(), 64); match reader.read(&mut buffer) { Err(e) => panic!("read should not fail here: {:?}", e), Ok(length) => assert_eq!(length, 42), } assert_eq!(reader.available_bytes(), 0); assert_eq!(reader.bytes_read(), 106); } #[test] fn writer_test_simple_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Writable, 8), (Writable, 16), (Writable, 18), (Writable, 64), ], 0, ) .expect("create_descriptor_chain failed"); let mut writer = Writer::new(&memory, chain).expect("failed to create Writer"); assert_eq!(writer.available_bytes(), 106); assert_eq!(writer.bytes_written(), 0); let buffer = [0 as u8; 64]; if let Err(e) = writer.write_all(&buffer) { panic!("write_all should not fail here: {:?}", e); } assert_eq!(writer.available_bytes(), 42); assert_eq!(writer.bytes_written(), 64); match writer.write(&buffer) { Err(e) => panic!("write should not fail here {:?}", e), Ok(length) => assert_eq!(length, 42), } assert_eq!(writer.available_bytes(), 0); assert_eq!(writer.bytes_written(), 106); } #[test] fn reader_test_incompatible_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Writable, 8)], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); assert_eq!(reader.available_bytes(), 0); assert_eq!(reader.bytes_read(), 0); 
assert!(reader.read_obj::().is_err()); assert_eq!(reader.available_bytes(), 0); assert_eq!(reader.bytes_read(), 0); } #[test] fn writer_test_incompatible_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Readable, 8)], 0, ) .expect("create_descriptor_chain failed"); let mut writer = Writer::new(&memory, chain).expect("failed to create Writer"); assert_eq!(writer.available_bytes(), 0); assert_eq!(writer.bytes_written(), 0); assert!(writer.write_obj(0u8).is_err()); assert_eq!(writer.available_bytes(), 0); assert_eq!(writer.bytes_written(), 0); } #[test] fn reader_writer_shared_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain.clone()).expect("failed to create Reader"); let mut writer = Writer::new(&memory, chain).expect("failed to create Writer"); assert_eq!(reader.bytes_read(), 0); assert_eq!(writer.bytes_written(), 0); let mut buffer = Vec::with_capacity(200); assert_eq!( reader .read_to_end(&mut buffer) .expect("read should not fail here"), 128 ); // The writable descriptors are only 68 bytes long. 
writer .write_all(&buffer[..68]) .expect("write should not fail here"); assert_eq!(reader.available_bytes(), 0); assert_eq!(reader.bytes_read(), 128); assert_eq!(writer.available_bytes(), 0); assert_eq!(writer.bytes_written(), 68); } #[test] fn reader_writer_shattered_object() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let secret: Le32 = 0x1234_5678.into(); // Create a descriptor chain with memory regions that are properly separated. let chain_writer = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Writable, 1), (Writable, 1), (Writable, 1), (Writable, 1)], 123, ) .expect("create_descriptor_chain failed"); let mut writer = Writer::new(&memory, chain_writer).expect("failed to create Writer"); if let Err(e) = writer.write_obj(secret) { panic!("write_obj should not fail here: {:?}", e); } // Now create new descriptor chain pointing to the same memory and try to read it. 
let chain_reader = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Readable, 1), (Readable, 1), (Readable, 1), (Readable, 1)], 123, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain_reader).expect("failed to create Reader"); match reader.read_obj::() { Err(e) => panic!("read_obj should not fail here: {:?}", e), Ok(read_secret) => assert_eq!(read_secret, secret), } } #[test] fn reader_unexpected_eof() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Readable, 256), (Readable, 256)], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let mut buf = vec![0; 1024]; assert_eq!( reader .read_exact(&mut buf[..]) .expect_err("read more bytes than available") .kind(), io::ErrorKind::UnexpectedEof ); } #[test] fn split_border() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let other = reader.split_at(32).expect("failed to split Reader"); assert_eq!(reader.available_bytes(), 32); assert_eq!(other.available_bytes(), 96); } #[test] fn split_middle() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), 
(Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let other = reader.split_at(24).expect("failed to split Reader"); assert_eq!(reader.available_bytes(), 24); assert_eq!(other.available_bytes(), 104); } #[test] fn split_end() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let other = reader.split_at(128).expect("failed to split Reader"); assert_eq!(reader.available_bytes(), 128); assert_eq!(other.available_bytes(), 0); } #[test] fn split_beginning() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let other = reader.split_at(0).expect("failed to split Reader"); assert_eq!(reader.available_bytes(), 0); assert_eq!(other.available_bytes(), 128); } #[test] fn split_outofbounds() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), 
(Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); if reader.split_at(256).is_ok() { panic!("successfully split Reader with out of bounds offset"); } } #[test] fn read_full() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Readable, 16), (Readable, 16), (Readable, 16)], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let mut buf = vec![0u8; 64]; assert_eq!( reader.read(&mut buf[..]).expect("failed to read to buffer"), 48 ); } #[test] fn write_full() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Writable, 16), (Writable, 16), (Writable, 16)], 0, ) .expect("create_descriptor_chain failed"); let mut writer = Writer::new(&memory, chain).expect("failed to create Writer"); let buf = vec![0xdeu8; 64]; assert_eq!( writer.write(&buf[..]).expect("failed to write from buffer"), 48 ); } } virtiofsd-1.13.0/src/file_traits.rs000064400000000000000000000131141046102023000153610ustar 00000000000000// Copyright 2018 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use std::convert::TryInto; use std::fs::File; use std::io::{Error, Result}; use std::os::unix::io::{AsFd, AsRawFd}; use vm_memory::VolatileSlice; use crate::oslib; use libc::{c_int, c_void, off64_t, preadv64, size_t}; use vm_memory::bitmap::BitmapSlice; /// A trait for setting the size of a file. 
/// This is equivalent to File's `set_len` method, but /// wrapped in a trait so that it can be implemented for /// other types. pub trait FileSetLen { // Set the size of this file. // This is the moral equivalent of `ftruncate()`. fn set_len(&self, _len: u64) -> Result<()>; } impl FileSetLen for File { fn set_len(&self, len: u64) -> Result<()> { File::set_len(self, len) } } /// A trait similar to the unix `ReadExt` and `WriteExt` traits, but for volatile memory. pub trait FileReadWriteAtVolatile { /// Reads bytes from this file at `offset` into the given slice of buffers, returning the number /// of bytes read on success. Data is copied to fill each buffer in order, with the final buffer /// written to possibly being only partially filled. fn read_vectored_at_volatile(&self, bufs: &[&VolatileSlice], offset: u64) -> Result; /// Writes bytes to this file at `offset` from the given slice of buffers, returning the number /// of bytes written on success. Data is copied from each buffer in order, with the final buffer /// read from possibly being only partially consumed. fn write_vectored_at_volatile( &self, bufs: &[&VolatileSlice], offset: u64, flags: Option, ) -> Result; } impl + ?Sized> FileReadWriteAtVolatile for &T { fn read_vectored_at_volatile(&self, bufs: &[&VolatileSlice], offset: u64) -> Result { (**self).read_vectored_at_volatile(bufs, offset) } fn write_vectored_at_volatile( &self, bufs: &[&VolatileSlice], offset: u64, flags: Option, ) -> Result { (**self).write_vectored_at_volatile(bufs, offset, flags) } } macro_rules! 
volatile_impl { ($ty:ty) => { impl FileReadWriteAtVolatile for $ty { fn read_vectored_at_volatile( &self, bufs: &[&VolatileSlice], offset: u64, ) -> Result { let slice_guards: Vec<_> = bufs.iter().map(|s| s.ptr_guard_mut()).collect(); let iovecs: Vec = slice_guards .iter() .map(|s| libc::iovec { iov_base: s.as_ptr() as *mut c_void, iov_len: s.len() as size_t, }) .collect(); if iovecs.is_empty() { return Ok(0); } // SAFETY: Safe because only bytes inside the buffers are // accessed and the kernel is expected to handle arbitrary // memory for I/O. The pointers into the slice are valid since // the slice_guards are still in scope. let ret = unsafe { preadv64( self.as_raw_fd(), &iovecs[0], iovecs.len() as c_int, offset as off64_t, ) }; if ret >= 0 { let mut total = 0; for vs in bufs { // Each `VolatileSlice` has a "local" bitmap (i.e., the offset 0 in the // bitmap corresponds to the beginning of the `VolatileSlice`) vs.bitmap() .mark_dirty(0, std::cmp::min(ret as usize - total, vs.len())); total += vs.len(); if total >= ret as usize { break; } } Ok(ret as usize) } else { Err(Error::last_os_error()) } } fn write_vectored_at_volatile( &self, bufs: &[&VolatileSlice], offset: u64, flags: Option, ) -> Result { let slice_guards: Vec<_> = bufs.iter().map(|s| s.ptr_guard()).collect(); let iovecs: Vec = slice_guards .iter() .map(|s| libc::iovec { iov_base: s.as_ptr() as *mut c_void, iov_len: s.len() as size_t, }) .collect(); if iovecs.is_empty() { return Ok(0); } // SAFETY: Each `libc::iovec` element is created from a // `VolatileSlice` of the guest memory. The pointers are valid // because the slice guards are still in scope. We also ensure // that we do not read over the slice bounds. unsafe { oslib::writev_at( self.as_fd(), iovecs.as_slice(), offset.try_into().unwrap(), flags, ) } } } }; } volatile_impl!(File); virtiofsd-1.13.0/src/filesystem.rs000064400000000000000000001316751046102023000152550ustar 00000000000000// Copyright 2019 The Chromium OS Authors. 
All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use std::ffi::{CStr, CString}; use std::fs::File; use std::sync::atomic::AtomicBool; use std::sync::Arc; use std::time::Duration; use std::{io, mem}; use crate::soft_idmap::{GuestGid, GuestUid}; use crate::{fuse, oslib}; pub use fuse::{FsOptions, OpenOptions, RemovemappingOne, SetattrValid, SetxattrFlags, ROOT_ID}; /// Information about a path in the filesystem. pub struct Entry { /// An `Inode` that uniquely identifies this path. During `lookup`, setting this to `0` means a /// negative entry. Returning `ENOENT` also means a negative entry but setting this to `0` /// allows the kernel to cache the negative result for `entry_timeout`. The value should be /// produced by converting a `FileSystem::Inode` into a `u64`. pub inode: u64, /// The generation number for this `Entry`. Typically used for network file systems. An `inode` /// / `generation` pair must be unique over the lifetime of the file system (rather than just /// the lifetime of the mount). In other words, if a `FileSystem` implementation re-uses an /// `Inode` after it has been deleted then it must assign a new, previously unused generation /// number to the `Inode` at the same time. pub generation: u64, /// Inode attributes. Even if `attr_timeout` is zero, `attr` must be correct. For example, for /// `open()`, FUSE uses `attr.st_size` from `lookup()` to determine how many bytes to request. /// If this value is not correct, incorrect data will be returned. pub attr: fuse::Attr, /// How long the values in `attr` should be considered valid. If the attributes of the `Entry` /// are only modified by the FUSE client, then this should be set to a very large value. pub attr_timeout: Duration, /// How long the name associated with this `Entry` should be considered valid. 
If directory /// entries are only changed or deleted by the FUSE client, then this should be set to a very /// large value. pub entry_timeout: Duration, } impl From for fuse::EntryOut { fn from(entry: Entry) -> fuse::EntryOut { fuse::EntryOut { nodeid: entry.inode, generation: entry.generation, entry_valid: entry.entry_timeout.as_secs(), attr_valid: entry.attr_timeout.as_secs(), entry_valid_nsec: entry.entry_timeout.subsec_nanos(), attr_valid_nsec: entry.attr_timeout.subsec_nanos(), attr: entry.attr, } } } /// Represents information about an entry in a directory. pub struct DirEntry<'a> { /// The inode number for this entry. This does NOT have to be the same as the `Inode` for this /// directory entry. However, it must be the same as the `attr.st_ino` field of the `Entry` that /// would be returned by a `lookup` request in the parent directory for `name`. pub ino: libc::ino64_t, /// Any non-zero value that the kernel can use to identify the current point in the directory /// entry stream. It does not need to be the actual physical position. A value of `0` is /// reserved to mean "from the beginning" and should never be used. The `offset` value of the /// first entry in a stream should point to the beginning of the second entry and so on. pub offset: u64, /// The type of this directory entry. Valid values are any of the `libc::DT_*` constants. pub type_: u32, /// The name of this directory entry. There are no requirements for the contents of this field /// and any sequence of bytes is considered valid. pub name: &'a CStr, } /// A reply to a `getxattr` method call. pub enum GetxattrReply { /// The value of the requested extended attribute. This can be arbitrary textual or binary data /// and does not need to be nul-terminated. Value(Vec), /// The size of the buffer needed to hold the value of the requested extended attribute. Should /// be returned when the `size` parameter is 0. 
Callers should note that it is still possible /// for the size of the value to change in between `getxattr` calls and should not assume that a /// subsequent call to `getxattr` with the returned count will always succeed. Count(u32), } /// A reply to a `listxattr` method call. pub enum ListxattrReply { /// A buffer containing a nul-separated list of the names of all the extended attributes /// associated with this `Inode`. This list of names may be unordered and includes a namespace /// prefix. There may be several disjoint namespaces associated with a single `Inode`. Names(Vec), /// This size of the buffer needed to hold the full list of extended attribute names associated /// with this `Inode`. Should be returned when the `size` parameter is 0. Callers should note /// that it is still possible for the set of extended attributes to change between `listxattr` /// calls and so should not assume that a subsequent call to `listxattr` with the returned count /// will always succeed. Count(u32), } /// A trait for directly copying data from the fuse transport into a `File` without first storing it /// in an intermediate buffer. pub trait ZeroCopyReader { /// Copies at most `count` bytes from `self` directly into `f` at offset `off` without storing /// it in any intermediate buffers. If the return value is `Ok(n)` then it must be guaranteed /// that `0 <= n <= count`. If `n` is `0`, then it can indicate one of 3 possibilities: /// /// 1. There is no more data left in `self`. /// 2. There is no more space in `f`. /// 3. `count` was `0`. /// /// Does not do short writes, unless one of the above happens; i.e. will invoke the underlying /// file write function until `count` bytes have been written or it returns 0 (case 2 above) or /// `self` is empty (case 1 above). 
fn write_to_file_at( &mut self, f: &File, count: usize, off: u64, flags: Option, ) -> io::Result; } impl ZeroCopyReader for &mut R { fn write_to_file_at( &mut self, f: &File, count: usize, off: u64, flags: Option, ) -> io::Result { (**self).write_to_file_at(f, count, off, flags) } } /// A trait for directly copying data from a `File` into the fuse transport without first storing /// it in an intermediate buffer. pub trait ZeroCopyWriter { /// Copies at most `count` bytes from `f` at offset `off` directly into `self` without storing /// it in any intermediate buffers. If the return value is `Ok(n)` then it must be guaranteed /// that `0 <= n <= count`. If `n` is `0`, then it can indicate one of 4 possibilities: /// /// 1. There is no more data left in `f`. /// 2. End of `f` reached. /// 3. There is no more space in `self`. /// 4. `count` was `0`. /// /// Does not do short reads, unless one of the above happens; i.e. will invoke the underlying /// file read function until `count` bytes have been read or it returns 0 (cases 1 or 2 above) /// or `self` has no more space (case 2 above). fn read_from_file_at(&mut self, f: &File, count: usize, off: u64) -> io::Result; } impl ZeroCopyWriter for &mut W { fn read_from_file_at(&mut self, f: &File, count: usize, off: u64) -> io::Result { (**self).read_from_file_at(f, count, off) } } /// Additional context associated with requests. #[derive(Clone, Copy, Debug)] pub struct Context { /// The user ID of the calling process. pub uid: GuestUid, /// The group ID of the calling process. pub gid: GuestGid, /// The thread group ID of the calling process. pub pid: libc::pid_t, } impl From for Context { fn from(source: fuse::InHeader) -> Self { Context { uid: source.uid, gid: source.gid, pid: source.pid as i32, } } } /// Request extensions #[derive(Clone, Default, Debug)] pub struct Extensions { pub secctx: Option, pub sup_gid: Option, } /// Additional security context associated with requests. 
#[derive(Clone, Debug, Default)] pub struct SecContext { /// Name of security context pub name: CString, /// Actual security context pub secctx: Vec, } /// A trait for iterating over the contents of a directory. This trait is needed because rust /// doesn't support generic associated types, which means that it's not possible to implement a /// regular iterator that yields a `DirEntry` due to its generic lifetime parameter. pub trait DirectoryIterator { /// Returns the next entry in the directory or `None` if there are no more. fn next(&mut self) -> Option; } /// The main trait that connects a file system with a transport. #[allow(unused_variables)] pub trait FileSystem { /// Represents a location in the filesystem tree and can be used to perform operations that act /// on the metadata of a file/directory (e.g., `getattr` and `setattr`). Can also be used as the /// starting point for looking up paths in the filesystem tree. An `Inode` may support operating /// directly on the content of the path that to which it points. `FileSystem` implementations /// that support this should set the `FsOptions::ZERO_MESSAGE_OPEN` option in the return value /// of the `init` function. On linux based systems, an `Inode` is equivalent to opening a file /// or directory with the `libc::O_PATH` flag. /// /// # Lookup Count /// /// The `FileSystem` implementation is required to keep a "lookup count" for every `Inode`. /// Every time an `Entry` is returned by a `FileSystem` trait method, this lookup count should /// increase by 1. The lookup count for an `Inode` decreases when the kernel sends a `forget` /// request. `Inode`s with a non-zero lookup count may receive requests from the kernel even /// after calls to `unlink`, `rmdir` or (when overwriting an existing file) `rename`. /// `FileSystem` implementations must handle such requests properly and it is recommended to /// defer removal of the `Inode` until the lookup count reaches zero. 
Calls to `unlink`, `rmdir` /// or `rename` will be followed closely by `forget` unless the file or directory is open, in /// which case the kernel issues `forget` only after the `release` or `releasedir` calls. /// /// Note that if a file system will be exported over NFS the `Inode`'s lifetime must extend even /// beyond `forget`. See the `generation` field in `Entry`. type Inode: From + Into; /// Represents a file or directory that is open for reading/writing. type Handle: From + Into; /// An iterator over the entries of a directory. See the documentation for `readdir` for more /// details. type DirIter: DirectoryIterator; /// Initialize the file system. /// /// This method is called when a connection to the FUSE kernel module is first established. The /// `capable` parameter indicates the features that are supported by the kernel module. The /// implementation should return the options that it supports. Any options set in the returned /// `FsOptions` that are not also set in `capable` are silently dropped. fn init(&self, capable: FsOptions) -> io::Result { Ok(FsOptions::empty()) } /// Clean up the file system. /// /// Called when the filesystem exits. All open `Handle`s should be closed and the lookup count /// for all open `Inode`s implicitly goes to zero. At this point the connection to the FUSE /// kernel module may already be gone so implementations should not rely on being able to /// communicate with the kernel. fn destroy(&self) {} /// Look up a directory entry by name and get its attributes. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. fn lookup(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Forget about an inode. /// /// Called when the kernel removes an inode from its internal caches. `count` indicates the /// amount by which the lookup count for the inode should be decreased. 
If reducing the lookup /// count by `count` causes it to go to zero, then the implementation may delete the `Inode`. fn forget(&self, ctx: Context, inode: Self::Inode, count: u64) {} /// Forget about multiple inodes. /// /// `requests` is a vector of `(inode, count)` pairs. See the documentation for `forget` for /// more information. fn batch_forget(&self, ctx: Context, requests: Vec<(Self::Inode, u64)>) { for (inode, count) in requests { self.forget(ctx, inode, count) } } /// Get attributes for a file / directory. /// /// If `handle` is not `None`, then it contains the handle previously returned by the /// implementation after a call to `open` or `opendir`. However, implementations should still /// take care to verify the handle if they do not trust the client (e.g., virtio-fs). /// /// If writeback caching is enabled (`FsOptions::WRITEBACK_CACHE`), then the kernel module /// likely has a better idea of the length of the file than the file system (for /// example, if there was a write that extended the size of the file but has not yet been /// flushed). In this case, the `st_size` field of the returned struct is ignored. /// /// The returned `Duration` indicates how long the returned attributes should be considered /// valid by the client. If the attributes are only changed via the FUSE kernel module (i.e., /// the kernel module has exclusive access), then this should be a very large value. fn getattr( &self, ctx: Context, inode: Self::Inode, handle: Option, ) -> io::Result<(fuse::Attr, Duration)> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Set attributes for a file / directory. /// /// If `handle` is not `None`, then it contains the handle previously returned by the /// implementation after a call to `open` or `opendir`. However, implementations should still /// take care to verify the handle if they do not trust the client (e.g., virtio-fs). 
/// /// The `valid` parameter indicates the fields of `attr` that may be considered valid and should /// be set by the file system. The content of all other fields in `attr` is undefined. /// /// If the `FsOptions::HANDLE_KILLPRIV_V2` was set during `init`, then the implementation is /// expected to reset the setuid and setgid bits if the file size or owner is being changed. /// /// This method returns the new attributes after making the modifications requested by the /// client. The returned `Duration` indicates how long the returned attributes should be /// considered valid by the client. If the attributes are only changed via the FUSE kernel /// module (i.e., the kernel module has exclusive access), then this should be a very large /// value. fn setattr( &self, ctx: Context, inode: Self::Inode, attr: fuse::SetattrIn, handle: Option, valid: SetattrValid, ) -> io::Result<(fuse::Attr, Duration)> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Read a symbolic link. fn readlink(&self, ctx: Context, inode: Self::Inode) -> io::Result> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Create a symbolic link. /// /// The file system must create a symbolic link named `name` in the directory represented by /// `parent`, which contains the string `linkname`. Returns an `Entry` for the newly created /// symlink. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. fn symlink( &self, ctx: Context, linkname: &CStr, parent: Self::Inode, name: &CStr, extensions: Extensions, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Create a file node. /// /// Create a regular file, character device, block device, fifo, or socket node named `name` in /// the directory represented by `inode`. Valid values for `mode` and `rdev` are the same as /// those accepted by the `mknod(2)` system call. Returns an `Entry` for the newly created node. 
/// /// When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for setting /// the permissions of the created node to `mode & !umask`. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. #[allow(clippy::too_many_arguments)] fn mknod( &self, ctx: Context, inode: Self::Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, extensions: Extensions, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Create a directory. /// /// When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for setting /// the permissions of the created directory to `mode & !umask`. Returns an `Entry` for the /// newly created directory. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. fn mkdir( &self, ctx: Context, parent: Self::Inode, name: &CStr, mode: u32, umask: u32, extensions: Extensions, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Remove a file. /// /// If the file's inode lookup count is non-zero, then the file system is expected to delay /// removal of the inode until the lookup count goes to zero. See the documentation of the /// `forget` function for more information. fn unlink(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Remove a directory. /// /// If the directory's inode lookup count is non-zero, then the file system is expected to delay /// removal of the inode until the lookup count goes to zero. See the documentation of the /// `forget` function for more information. fn rmdir(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Rename a file / directory. /// /// If the destination exists, it should be atomically replaced. 
If the destination's inode /// lookup count is non-zero, then the file system is expected to delay removal of the inode /// until the lookup count goes to zero. See the documentation of the `forget` function for more /// information. /// /// `flags` may be `libc::RENAME_EXCHANGE` or `libc::RENAME_NOREPLACE`. If /// `libc::RENAME_NOREPLACE` is specified, the implementation must not overwrite `newname` if it /// exists and must return an error instead. If `libc::RENAME_EXCHANGE` is specified, the /// implementation must atomically exchange the two files, i.e., both must exist and neither may /// be deleted. fn rename( &self, ctx: Context, olddir: Self::Inode, oldname: &CStr, newdir: Self::Inode, newname: &CStr, flags: u32, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Create a hard link. /// /// Create a hard link from `inode` to `newname` in the directory represented by `newparent`. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. fn link( &self, ctx: Context, inode: Self::Inode, newparent: Self::Inode, newname: &CStr, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Open a file. /// /// Open the file associated with `inode` for reading / writing. All values accepted by the /// `open(2)` system call are valid values for `flags` and must be handled by the file system. /// However, there are some additional rules: /// /// * Creation flags (`libc::O_CREAT`, `libc::O_EXCL`, `libc::O_NOCTTY`) will be filtered out /// and handled by the kernel. /// /// * The file system should check the access modes (`libc::O_RDONLY`, `libc::O_WRONLY`, /// `libc::O_RDWR`) to determine if the operation is permitted. If the file system was mounted /// with the `-o default_permissions` mount option, then this check will also be carried out /// by the kernel before sending the open request. 
/// /// * When writeback caching is enabled (`FsOptions::WRITEBACK_CACHE`) the kernel may send read /// requests even for files opened with `libc::O_WRONLY`. The file system should be prepared /// to handle this. /// /// * When writeback caching is enabled, the kernel will handle the `libc::O_APPEND` flag. /// However, this will not work reliably unless the kernel has exclusive access to the file. /// In this case the file system may either ignore the `libc::O_APPEND` flag or return an /// error to indicate that reliable `libc::O_APPEND` handling is not available. /// /// * When writeback caching is disabled, the file system is expected to properly handle /// `libc::O_APPEND` and ensure that each write is appended to the end of the file. /// /// The file system may choose to return a `Handle` to refer to the newly opened file. The /// kernel will then use this `Handle` for all operations on the content of the file (`read`, /// `write`, `flush`, `release`, `fsync`). If the file system does not return a /// `Handle` then the kernel will use the `Inode` for the file to operate on its contents. In /// this case the file system may wish to enable the `FsOptions::ZERO_MESSAGE_OPEN` feature if /// it is supported by the kernel (see below). /// /// The returned `OpenOptions` allow the file system to change the way the opened file is /// handled by the kernel. See the documentation of `OpenOptions` for more information. /// /// If `kill_priv` is true then it indicates that the file system is expected to clear the /// setuid and setgid bits. /// /// If the `FsOptions::ZERO_MESSAGE_OPEN` feature is enabled by both the file system /// implementation and the kernel, then the file system may return an error of `ENOSYS`. This /// will be interpreted by the kernel as success and future calls to `open` and `release` will /// be handled by the kernel without being passed on to the file system. 
fn open( &self, ctx: Context, inode: Self::Inode, kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { // Matches the behavior of libfuse. Ok((None, OpenOptions::empty())) } /// Create and open a file. /// /// If the file does not already exist, the file system should create it with the specified /// `mode`. When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for /// setting the permissions of the created file to `mode & !umask`. /// /// If `kill_priv` is true then it indicates that the file system is expected to clear the /// setuid and setgid bits. /// /// If the file system returns an `ENOSYS` error, then the kernel will treat this method as /// unimplemented and all future calls to `create` will be handled by calling the `mknod` and /// `open` methods instead. /// /// See the documentation for the `open` method for more information about opening the file. In /// addition to the optional `Handle` and the `OpenOptions`, the file system must also return an /// `Entry` for the file. This increases the lookup count for the `Inode` associated with the /// file by 1. #[allow(clippy::too_many_arguments)] fn create( &self, ctx: Context, parent: Self::Inode, name: &CStr, mode: u32, kill_priv: bool, flags: u32, umask: u32, extensions: Extensions, ) -> io::Result<(Entry, Option, OpenOptions)> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Read data from a file. /// /// Returns `size` bytes of data starting from offset `off` from the file associated with /// `inode` or `handle`. /// /// `flags` contains the flags used to open the file. Similarly, `handle` is the `Handle` /// returned by the file system from the `open` method, if any. If the file system /// implementation did not return a `Handle` from `open` then the contents of `handle` are /// undefined. /// /// This method should return exactly the number of bytes requested by the kernel, except in the /// case of error or EOF. 
Otherwise, the kernel will substitute the rest of the data with /// zeroes. An exception to this rule is if the file was opened with the "direct I/O" option /// (`libc::O_DIRECT`), in which case the kernel will forward the return code from this method /// to the userspace application that made the system call. #[allow(clippy::too_many_arguments)] fn read( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, w: W, size: u32, offset: u64, lock_owner: Option, flags: u32, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Write data to a file. /// /// Writes `size` bytes of data starting from offset `off` to the file associated with `inode` /// or `handle`. /// /// `flags` contains the flags used to open the file. Similarly, `handle` is the `Handle` /// returned by the file system from the `open` method, if any. If the file system /// implementation did not return a `Handle` from `open` then the contents of `handle` are /// undefined. /// /// If `delayed_write` is true then it indicates that this is a write for buffered data. /// /// If `kill_priv` is true then it indicates that the file system is expected to clear the /// setuid and setgid bits. /// /// This method should return exactly the number of bytes requested by the kernel, except in the /// case of error. An exception to this rule is if the file was opened with the "direct I/O" /// option (`libc::O_DIRECT`), in which case the kernel will forward the return code from this /// method to the userspace application that made the system call. #[allow(clippy::too_many_arguments)] fn write( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, r: R, size: u32, offset: u64, lock_owner: Option, delayed_write: bool, kill_priv: bool, flags: u32, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Flush the contents of a file. /// /// This method is called on every `close()` of a file descriptor. 
Since it is possible to /// duplicate file descriptors there may be many `flush` calls for one call to `open`. /// /// File systems should not make any assumptions about when `flush` will be /// called or even if it will be called at all. /// /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the /// file system did not return a `Handle` from `open` then the contents of `handle` are /// undefined. /// /// Unlike `fsync`, the file system is not required to flush pending writes. One reason to flush /// data is if the file system wants to return write errors during close. However, this is not /// portable because POSIX does not require `close` to wait for delayed I/O to complete. /// /// If the `FsOptions::POSIX_LOCKS` feature is enabled, then the file system must remove all /// locks belonging to `lock_owner`. /// /// If this method returns an `ENOSYS` error then the kernel will treat it as success and all /// subsequent calls to `flush` will be handled by the kernel without being forwarded to the /// file system. fn flush( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, lock_owner: u64, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Synchronize file contents. /// /// File systems must ensure that the file contents have been flushed to disk before returning /// from this method. If `datasync` is true then only the file data (but not the metadata) needs /// to be flushed. /// /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the /// file system did not return a `Handle` from `open` then the contents of /// `handle` are undefined. /// /// If this method returns an `ENOSYS` error then the kernel will treat it as success and all /// subsequent calls to `fsync` will be handled by the kernel without being forwarded to the /// file system. 
fn fsync( &self, ctx: Context, inode: Self::Inode, datasync: bool, handle: Self::Handle, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Allocate requested space for file data. /// /// If this function returns success, then the file system must guarantee that it is possible to /// write up to `length` bytes of data starting at `offset` without failing due to a lack of /// free space on the disk. /// /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the /// file system did not return a `Handle` from `open` then the contents of `handle` are /// undefined. /// /// If this method returns an `ENOSYS` error then the kernel will treat that as a permanent /// failure: all future calls to `fallocate` will fail with `EOPNOTSUPP` without being forwarded /// to the file system. fn fallocate( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Release an open file. /// /// This method is called when there are no more references to an open file: all file /// descriptors are closed and all memory mappings are unmapped. /// /// For every `open` call there will be exactly one `release` call (unless the file system is /// force-unmounted). /// /// The file system may reply with an error, but error values are not returned to the `close()` /// or `munmap()` which triggered the release. /// /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the /// file system did not return a `Handle` from `open` then the contents of /// `handle` are undefined. /// /// If `flush` is `true` then the contents of the file should also be flushed to disk. 
#[allow(clippy::too_many_arguments)] fn release( &self, ctx: Context, inode: Self::Inode, flags: u32, handle: Self::Handle, flush: bool, flock_release: bool, lock_owner: Option, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Get information about the file system. fn statfs(&self, ctx: Context, inode: Self::Inode) -> io::Result { // Safe because we are zero-initializing a struct with only POD fields. let mut st: libc::statvfs64 = unsafe { mem::zeroed() }; // This matches the behavior of libfuse as it returns these values if the // filesystem doesn't implement this method. st.f_namemax = 255; st.f_bsize = 512; Ok(st) } /// Set an extended attribute. /// /// If this method fails with an `ENOSYS` error, then the kernel will treat that as a permanent /// failure. The kernel will return `EOPNOTSUPP` for all future calls to `setxattr` without /// forwarding them to the file system. /// /// Valid values for flags are the same as those accepted by the `setxattr(2)` system call and /// have the same behavior. fn setxattr( &self, ctx: Context, inode: Self::Inode, name: &CStr, value: &[u8], flags: u32, extra_flags: SetxattrFlags, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Get an extended attribute. /// /// If `size` is 0, then the file system should respond with `GetxattrReply::Count` and the /// number of bytes needed to hold the value. If `size` is large enough to hold the value, then /// the file system should reply with `GetxattrReply::Value` and the value of the extended /// attribute. If `size` is not 0 but is also not large enough to hold the value, then the file /// system should reply with an `ERANGE` error. /// /// If this method fails with an `ENOSYS` error, then the kernel will treat that as a permanent /// failure. The kernel will return `EOPNOTSUPP` for all future calls to `getxattr` without /// forwarding them to the file system. 
fn getxattr( &self, ctx: Context, inode: Self::Inode, name: &CStr, size: u32, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// List extended attribute names. /// /// If `size` is 0, then the file system should respond with `ListxattrReply::Count` and the /// number of bytes needed to hold a `\0` byte separated list of the names of all the extended /// attributes. If `size` is large enough to hold the `\0` byte separated list of the attribute /// names, then the file system should reply with `ListxattrReply::Names` and the list. If /// `size` is not 0 but is also not large enough to hold the list, then the file system should /// reply with an `ERANGE` error. /// /// If this method fails with an `ENOSYS` error, then the kernel will treat that as a permanent /// failure. The kernel will return `EOPNOTSUPP` for all future calls to `listxattr` without /// forwarding them to the file system. fn listxattr(&self, ctx: Context, inode: Self::Inode, size: u32) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Remove an extended attribute. /// /// If this method fails with an `ENOSYS` error, then the kernel will treat that as a permanent /// failure. The kernel will return `EOPNOTSUPP` for all future calls to `removexattr` without /// forwarding them to the file system. fn removexattr(&self, ctx: Context, inode: Self::Inode, name: &CStr) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Open a directory for reading. /// /// The file system may choose to return a `Handle` to refer to the newly opened directory. The /// kernel will then use this `Handle` for all operations on the content of the directory /// (`readdir`, `readdirplus`, `fsyncdir`, `releasedir`). If the file system does not return a /// `Handle` then the kernel will use the `Inode` for the directory to operate on its contents. 
/// In this case the file system may wish to enable the `FsOptions::ZERO_MESSAGE_OPENDIR` /// feature if it is supported by the kernel (see below). /// /// The returned `OpenOptions` allow the file system to change the way the opened directory is /// handled by the kernel. See the documentation of `OpenOptions` for more information. /// /// If the `FsOptions::ZERO_MESSAGE_OPENDIR` feature is enabled by both the file system /// implementation and the kernel, then the file system may return an error of `ENOSYS`. This /// will be interpreted by the kernel as success and future calls to `opendir` and `releasedir` /// will be handled by the kernel without being passed on to the file system. fn opendir( &self, ctx: Context, inode: Self::Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { // Matches the behavior of libfuse. Ok((None, OpenOptions::empty())) } /// Read a directory. /// /// `handle` is the `Handle` returned by the file system from the `opendir` method, if any. If /// the file system did not return a `Handle` from `opendir` then the contents of `handle` are /// undefined. /// /// `size` indicates the maximum number of bytes that should be returned by this method. /// /// If `offset` is non-zero then it corresponds to one of the `offset` values from a `DirEntry` /// that was previously returned by a call to `readdir` for the same handle. In this case the /// file system should skip over the entries before the position defined by the `offset` value. /// If entries were added or removed while the `Handle` is open then the file system may still /// include removed entries or skip newly created entries. However, adding or removing entries /// should never cause the file system to skip over unrelated entries or include an entry more /// than once. This means that `offset` cannot be a simple index and must include sufficient /// information to uniquely determine the next entry in the list even when the set of entries is /// being changed. 
/// /// The file system may return entries for the current directory (".") and parent directory /// ("..") but is not required to do so. If the file system does not return these entries, then /// they are implicitly added by the kernel. /// /// The lookup count for `Inode`s associated with the returned directory entries is **NOT** /// affected by this method. /// fn readdir( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, size: u32, offset: u64, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Synchronize the contents of a directory. /// /// File systems must ensure that the directory contents have been flushed to disk before /// returning from this method. If `datasync` is true then only the directory data (but not the /// metadata) needs to be flushed. /// /// `handle` is the `Handle` returned by the file system from the `opendir` method, if any. If /// the file system did not return a `Handle` from `opendir` then the contents of /// `handle` are undefined. /// /// If this method returns an `ENOSYS` error then the kernel will treat it as success and all /// subsequent calls to `fsyncdir` will be handled by the kernel without being forwarded to the /// file system. fn fsyncdir( &self, ctx: Context, inode: Self::Inode, datasync: bool, handle: Self::Handle, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Release an open directory. /// /// For every `opendir` call there will be exactly one `releasedir` call (unless the file system /// is force-unmounted). /// /// `handle` is the `Handle` returned by the file system from the `opendir` method, if any. If /// the file system did not return a `Handle` from `opendir` then the contents of `handle` are /// undefined. /// /// `flags` contains used the flags used to open the directory in `opendir`. 
fn releasedir( &self, ctx: Context, inode: Self::Inode, flags: u32, handle: Self::Handle, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Check file access permissions. /// /// This method is called when a userspace process in the client makes an `access()` or /// `chdir()` system call. If the file system was mounted with the `-o default_permissions` /// mount option, then the kernel will perform these checks itself and this method will not be /// called. /// /// If this method returns an `ENOSYS` error, then the kernel will treat it as a permanent /// success: all future calls to `access` will return success without being forwarded to the /// file system. fn access(&self, ctx: Context, inode: Self::Inode, mask: u32) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Reposition read/write file offset. fn lseek( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, offset: u64, whence: u32, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } #[allow(clippy::too_many_arguments)] fn copyfilerange( &self, ctx: Context, inode_in: Self::Inode, handle_in: Self::Handle, offset_in: u64, inode_out: Self::Inode, handle_out: Self::Handle, offset_out: u64, len: u64, flags: u64, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Synchronize the filesystem containing the file referenced by `inode`. When running with /// --announce-submounts, `syncfs` is called once per submount that is to be synced. When /// running without--announce-submounts, `syncfs` is called on the root mount, but all submounts /// need to be synced, too. 
fn syncfs(&self, _ctx: Context, inode: Self::Inode) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn getlk(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn setlk(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn setlkw(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn ioctl(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn bmap(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn poll(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn notify_reply(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn tmpfile(&self) -> io::Result<(Entry, Option, OpenOptions)> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } } /// Allow filesystem's state to be serialized for migration. pub trait SerializableFileSystem { /// Prepare serialization of the filesystem state. /// /// Called once migration is initiated. If serialization of the filesystem state takes time, /// this allows starting serialization now, so that `serialize()` has less work to do and can /// finish quickly. /// /// This function is generally run in a separate thread, so is allowed to block and take time /// to complete. It should regularly check the value of the `cancel` bool, though, and if it /// becomes set, cancel the preparation and return as soon as reasonably possible. fn prepare_serialization(&self, _cancel: Arc) {} /// Serialize the filesystem state. /// /// Called during migration on the source side, when the vhost front-end requests this state in /// order to put it into its migration stream and transfer it to the destination. 
/// /// This function should finish quickly and not perform complex tasks. If such tasks are /// necessary, they should be started by `prepare_serialization()` so they can be done in time /// for `serialize()`. fn serialize(&self, _state_pipe: File) -> io::Result<()> { Err(io::Error::new( io::ErrorKind::Unsupported, "State serialization not supported", )) } /// Deserialize the filesystem state, and apply it. /// /// Called during migration on the destination side, when the vhost front-end has received this /// state from the source. We should deserialize and apply it, so we can resume where the /// source has left off. /// /// This function should finish quickly. Any complex tasks should be deferred, if possible. fn deserialize_and_apply(&self, _state_pipe: File) -> io::Result<()> { Err(io::Error::new( io::ErrorKind::Unsupported, "State deserialization not supported", )) } } virtiofsd-1.13.0/src/fuse.rs000064400000000000000000001171461046102023000140300ustar 00000000000000// Copyright 2019 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use std::convert::TryFrom; use std::io; use crate::macros::enum_value; use crate::soft_idmap::{GuestGid, GuestUid, HostGid, HostUid}; use bitflags::bitflags; use vm_memory::ByteValued; /// Version number of this interface. pub const KERNEL_VERSION: u32 = 7; /// Minor version number of this interface. pub const KERNEL_MINOR_VERSION: u32 = 38; /// Minimum Minor version number supported. If client sends a minor /// number lesser than this, we don't support it. pub const MIN_KERNEL_MINOR_VERSION: u32 = 27; /// The ID of the inode corresponding to the root directory of the file system. pub const ROOT_ID: u64 = 1; // Bitmasks for `fuse_setattr_in.valid`. 
const FATTR_MODE: u32 = 1 << 0; const FATTR_UID: u32 = 1 << 1; const FATTR_GID: u32 = 1 << 2; const FATTR_SIZE: u32 = 1 << 3; const FATTR_ATIME: u32 = 1 << 4; const FATTR_MTIME: u32 = 1 << 5; pub const FATTR_FH: u32 = 1 << 6; const FATTR_ATIME_NOW: u32 = 1 << 7; const FATTR_MTIME_NOW: u32 = 1 << 8; pub const FATTR_LOCKOWNER: u32 = 1 << 9; const FATTR_CTIME: u32 = 1 << 10; const FATTR_KILL_SUIDGID: u32 = 1 << 11; bitflags! { pub struct SetattrValid: u32 { const MODE = FATTR_MODE; const UID = FATTR_UID; const GID = FATTR_GID; const SIZE = FATTR_SIZE; const ATIME = FATTR_ATIME; const MTIME = FATTR_MTIME; const ATIME_NOW = FATTR_ATIME_NOW; const MTIME_NOW = FATTR_MTIME_NOW; const CTIME = FATTR_CTIME; const KILL_SUIDGID = FATTR_KILL_SUIDGID; } } // Flags returned by the OPEN request. /// Bypass page cache for this open file. const FOPEN_DIRECT_IO: u32 = 1 << 0; /// Don't invalidate the data cache on open. const FOPEN_KEEP_CACHE: u32 = 1 << 1; /// The file is not seekable. const FOPEN_NONSEEKABLE: u32 = 1 << 2; /// Allow caching this directory. const FOPEN_CACHE_DIR: u32 = 1 << 3; /// The file is stream-like (no file position at all). #[allow(dead_code)] const FOPEN_STREAM: u32 = 1 << 4; /// Don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) const FOPEN_NOFLUSH: u32 = 1 << 5; /// Allow concurrent direct writes on the same inode const FOPEN_PARALLEL_DIRECT_WRITES: u32 = 1 << 6; bitflags! { /// Options controlling the behavior of files opened by the server in response /// to an open or create request. pub struct OpenOptions: u32 { const DIRECT_IO = FOPEN_DIRECT_IO; const KEEP_CACHE = FOPEN_KEEP_CACHE; const NONSEEKABLE = FOPEN_NONSEEKABLE; const CACHE_DIR = FOPEN_CACHE_DIR; const STREAM = FOPEN_CACHE_DIR; const NOFLUSH = FOPEN_NOFLUSH; const PARALLEL_DIRECT_WRITES = FOPEN_PARALLEL_DIRECT_WRITES; } } // INIT request/reply flags. /// Asynchronous read requests. const ASYNC_READ: u64 = 1 << 0; /// Remote locking for POSIX file locks. 
const POSIX_LOCKS: u64 = 1 << 1; /// Kernel sends file handle for fstat, etc... (not yet supported). const FILE_OPS: u64 = 1 << 2; /// Handles the O_TRUNC open flag in the filesystem. const ATOMIC_O_TRUNC: u64 = 1 << 3; /// FileSystem handles lookups of "." and "..". const EXPORT_SUPPORT: u64 = 1 << 4; /// FileSystem can handle write size larger than 4kB. const BIG_WRITES: u64 = 1 << 5; /// Don't apply umask to file mode on create operations. const DONT_MASK: u64 = 1 << 6; /// Kernel supports splice write on the device. const SPLICE_WRITE: u64 = 1 << 7; /// Kernel supports splice move on the device. const SPLICE_MOVE: u64 = 1 << 8; /// Kernel supports splice read on the device. const SPLICE_READ: u64 = 1 << 9; /// Remote locking for BSD style file locks. const FLOCK_LOCKS: u64 = 1 << 10; /// Kernel supports ioctl on directories. const HAS_IOCTL_DIR: u64 = 1 << 11; /// Automatically invalidate cached pages. const AUTO_INVAL_DATA: u64 = 1 << 12; /// Do READDIRPLUS (READDIR+LOOKUP in one). const DO_READDIRPLUS: u64 = 1 << 13; /// Adaptive readdirplus. const READDIRPLUS_AUTO: u64 = 1 << 14; /// Asynchronous direct I/O submission. const ASYNC_DIO: u64 = 1 << 15; /// Use writeback cache for buffered writes. const WRITEBACK_CACHE: u64 = 1 << 16; /// Kernel supports zero-message opens. const NO_OPEN_SUPPORT: u64 = 1 << 17; /// Allow parallel lookups and readdir. const PARALLEL_DIROPS: u64 = 1 << 18; /// Fs handles killing suid/sgid/cap on write/chown/trunc. const HANDLE_KILLPRIV: u64 = 1 << 19; /// FileSystem supports posix acls. const POSIX_ACL: u64 = 1 << 20; /// Reading the device after abort returns ECONNABORTED. const ABORT_ERROR: u64 = 1 << 21; /// Init_out.max_pages contains the max number of req pages. 
// INIT negotiation flags, bits 22 and above (wire values from the FUSE ABI).
// Bits >= 32 travel in the extended `flags2` field (see INIT_EXT).
const MAX_PAGES: u64 = 1 << 22;
/// Cache READLINK responses
const CACHE_SYMLINKS: u64 = 1 << 23;
/// Kernel supports zero-message opendir
const NO_OPENDIR_SUPPORT: u64 = 1 << 24;
/// Only invalidate cached pages on explicit request
const EXPLICIT_INVAL_DATA: u64 = 1 << 25;
/// init_out.map_alignment contains log2(byte alignment) for
/// foffset and moffset fields in struct fuse_setupmapping_out and
/// fuse_removemapping_one
#[allow(dead_code)]
const MAP_ALIGNMENT: u64 = 1 << 26;
/// Kernel supports auto-mounting directory submounts
const SUBMOUNTS: u64 = 1 << 27;
/// Fs handles killing suid/sgid/cap on write/chown/trunc (v2).
const HANDLE_KILLPRIV_V2: u64 = 1 << 28;
/// Server supports extended struct SetxattrIn
const SETXATTR_EXT: u64 = 1 << 29;
/// Extended fuse_init_in request
const INIT_EXT: u64 = 1 << 30;
/// Reserved. Do not use.
const INIT_RESERVED: u64 = 1 << 31;
/// Add security context to create, mkdir, symlink, and mknod
const SECURITY_CTX: u64 = 1 << 32;
/// Use per inode DAX
const HAS_INODE_DAX: u64 = 1 << 33;
/// Add supplementary groups info to create, mkdir, symlink
/// and mknod (single group that matches parent)
const CREATE_SUPP_GROUP: u64 = 1 << 34;
/// Relax restrictions in FOPEN_DIRECT_IO mode to allow shared mmap.
const DIRECT_IO_ALLOW_MMAP: u64 = 1 << 36;
/// We need this for idmapped mounts support
const ALLOW_IDMAP: u64 = 1 << 40;

bitflags! {
    /// A bitfield passed in as a parameter to and returned from the `init` method of the
    /// `FileSystem` trait.
    pub struct FsOptions: u64 {
        /// Indicates that the filesystem supports asynchronous read requests.
        ///
        /// If this capability is not requested/available, the kernel will ensure that there is at
        /// most one pending read request per file-handle at any time, and will attempt to order
        /// read requests by increasing offset.
        ///
        /// This feature is enabled by default when supported by the kernel.
        const ASYNC_READ = ASYNC_READ;

        /// Indicates that the filesystem supports "remote" locking.
        ///
        /// This feature is not enabled by default and should only be set if the filesystem
        /// implements the `getlk` and `setlk` methods of the `FileSystem` trait.
        const POSIX_LOCKS = POSIX_LOCKS;

        /// Kernel sends file handle for fstat, etc... (not yet supported).
        const FILE_OPS = FILE_OPS;

        /// Indicates that the filesystem supports the `O_TRUNC` open flag. If disabled, and an
        /// application specifies `O_TRUNC`, fuse first calls `setattr` to truncate the file and
        /// then calls `open` with `O_TRUNC` filtered out.
        ///
        /// This feature is enabled by default when supported by the kernel.
        const ATOMIC_O_TRUNC = ATOMIC_O_TRUNC;

        /// Indicates that the filesystem supports lookups of "." and "..".
        ///
        /// This feature is disabled by default.
        const EXPORT_SUPPORT = EXPORT_SUPPORT;

        /// FileSystem can handle write size larger than 4kB.
        const BIG_WRITES = BIG_WRITES;

        /// Indicates that the kernel should not apply the umask to the file mode on create
        /// operations.
        ///
        /// This feature is disabled by default.
        const DONT_MASK = DONT_MASK;

        /// Indicates that the server should try to use `splice(2)` when writing to the fuse device.
        /// This may improve performance.
        ///
        /// This feature is not currently supported.
        const SPLICE_WRITE = SPLICE_WRITE;

        /// Indicates that the server should try to move pages instead of copying when writing to /
        /// reading from the fuse device. This may improve performance.
        ///
        /// This feature is not currently supported.
        const SPLICE_MOVE = SPLICE_MOVE;

        /// Indicates that the server should try to use `splice(2)` when reading from the fuse
        /// device. This may improve performance.
        ///
        /// This feature is not currently supported.
        const SPLICE_READ = SPLICE_READ;

        /// If set, then calls to `flock` will be emulated using POSIX locks and must
        /// then be handled by the filesystem's `setlock()` handler.
        ///
        /// If not set, `flock` calls will be handled by the FUSE kernel module internally (so any
        /// access that does not go through the kernel cannot be taken into account).
        ///
        /// This feature is disabled by default.
        const FLOCK_LOCKS = FLOCK_LOCKS;

        /// Indicates that the filesystem supports ioctl's on directories.
        ///
        /// This feature is enabled by default when supported by the kernel.
        const HAS_IOCTL_DIR = HAS_IOCTL_DIR;

        /// Traditionally, while a file is open the FUSE kernel module only asks the filesystem for
        /// an update of the file's attributes when a client attempts to read beyond EOF. This is
        /// unsuitable for e.g. network filesystems, where the file contents may change without the
        /// kernel knowing about it.
        ///
        /// If this flag is set, FUSE will check the validity of the attributes on every read. If
        /// the attributes are no longer valid (i.e., if the *attribute* timeout has expired) then
        /// FUSE will first send another `getattr` request. If the new mtime differs from the
        /// previous value, any cached file *contents* will be invalidated as well.
        ///
        /// This flag should always be set when available. If all file changes go through the
        /// kernel, *attribute* validity should be set to a very large number to avoid unnecessary
        /// `getattr()` calls.
        ///
        /// This feature is enabled by default when supported by the kernel.
        const AUTO_INVAL_DATA = AUTO_INVAL_DATA;

        /// Indicates that the filesystem supports readdirplus.
        ///
        /// The feature is not enabled by default and should only be set if the filesystem
        /// implements the `readdirplus` method of the `FileSystem` trait.
        const DO_READDIRPLUS = DO_READDIRPLUS;

        /// Indicates that the filesystem supports adaptive readdirplus.
        ///
        /// If `DO_READDIRPLUS` is not set, this flag has no effect.
        ///
        /// If `DO_READDIRPLUS` is set and this flag is not set, the kernel will always issue
        /// `readdirplus()` requests to retrieve directory contents.
        ///
        /// If `DO_READDIRPLUS` is set and this flag is set, the kernel will issue both `readdir()`
        /// and `readdirplus()` requests, depending on how much information is expected to be
        /// required.
        ///
        /// This feature is not enabled by default and should only be set if the file system
        /// implements both the `readdir` and `readdirplus` methods of the `FileSystem` trait.
        const READDIRPLUS_AUTO = READDIRPLUS_AUTO;

        /// Indicates that the filesystem supports asynchronous direct I/O submission.
        ///
        /// If this capability is not requested/available, the kernel will ensure that there is at
        /// most one pending read and one pending write request per direct I/O file-handle at any
        /// time.
        ///
        /// This feature is enabled by default when supported by the kernel.
        const ASYNC_DIO = ASYNC_DIO;

        /// Indicates that writeback caching should be enabled. This means that individual write
        /// request may be buffered and merged in the kernel before they are sent to the file
        /// system.
        ///
        /// This feature is disabled by default.
        const WRITEBACK_CACHE = WRITEBACK_CACHE;

        /// Indicates support for zero-message opens. If this flag is set in the `capable` parameter
        /// of the `init` trait method, then the file system may return `ENOSYS` from the open() handler
        /// to indicate success. Further attempts to open files will be handled in the kernel. (If
        /// this flag is not set, returning ENOSYS will be treated as an error and signaled to the
        /// caller).
        ///
        /// Setting (or not setting) the field in the `FsOptions` returned from the `init` method
        /// has no effect.
        const ZERO_MESSAGE_OPEN = NO_OPEN_SUPPORT;

        /// Indicates support for parallel directory operations. If this flag is unset, the FUSE
        /// kernel module will ensure that lookup() and readdir() requests are never issued
        /// concurrently for the same directory.
        ///
        /// This feature is enabled by default when supported by the kernel.
        const PARALLEL_DIROPS = PARALLEL_DIROPS;

        /// Indicates that the file system is responsible for unsetting setuid and setgid bits when a
        /// file is written, truncated, or its owner is changed.
        ///
        /// This feature is not currently supported.
        const HANDLE_KILLPRIV = HANDLE_KILLPRIV;

        /// Indicates support for POSIX ACLs.
        ///
        /// If this feature is enabled, the kernel will cache and have responsibility for enforcing
        /// ACLs. ACL will be stored as xattrs and passed to userspace, which is responsible for
        /// updating the ACLs in the filesystem, keeping the file mode in sync with the ACL, and
        /// ensuring inheritance of default ACLs when new filesystem nodes are created. Note that
        /// this requires that the file system is able to parse and interpret the xattr
        /// representation of ACLs.
        ///
        /// Enabling this feature implicitly turns on the `default_permissions` mount option (even
        /// if it was not passed to mount(2)).
        ///
        /// This feature is disabled by default.
        const POSIX_ACL = POSIX_ACL;

        /// Indicates that if the connection is gone because of sysfs abort, reading from the device
        /// will return -ECONNABORTED.
        ///
        /// This feature is not currently supported.
        const ABORT_ERROR = ABORT_ERROR;

        /// Indicates support for negotiating the maximum number of pages supported.
        ///
        /// If this feature is enabled, we can tell the kernel the maximum number of pages that we
        /// support to transfer in a single request.
        ///
        /// This feature is enabled by default if supported by the kernel.
        const MAX_PAGES = MAX_PAGES;

        /// Indicates that the kernel supports caching READLINK responses.
        ///
        /// This feature is not currently supported.
        const CACHE_SYMLINKS = CACHE_SYMLINKS;

        /// Indicates support for zero-message opens. If this flag is set in the `capable` parameter
        /// of the `init` trait method, then the file system may return `ENOSYS` from the opendir() handler
        /// to indicate success. Further attempts to open directories will be handled in the kernel.
        /// (If this flag is not set, returning ENOSYS will be treated as an error and signaled to
        /// the caller).
        ///
        /// Setting (or not setting) the field in the `FsOptions` returned from the `init` method
        /// has no effect.
        const ZERO_MESSAGE_OPENDIR = NO_OPENDIR_SUPPORT;

        /// Indicates support for explicit data invalidation. If this feature is enabled, the
        /// server is fully responsible for data cache invalidation, and the kernel won't
        /// invalidate files data cache on size change and only truncate that cache to new size
        /// in case the size decreased.
        ///
        /// This feature is not currently supported.
        const EXPLICIT_INVAL_DATA = EXPLICIT_INVAL_DATA;

        /// Indicates that the kernel supports the FUSE_ATTR_SUBMOUNT flag.
        ///
        /// Setting (or not setting) this flag in the `FsOptions` returned from the `init` method
        /// has no effect.
        const SUBMOUNTS = SUBMOUNTS;

        /// Indicates that the filesystem is responsible for clearing
        /// security.capability xattr and clearing setuid and setgid bits. Following
        /// are the rules.
        /// - clear "security.capability" on write, truncate and chown unconditionally
        /// - clear suid/sgid if following is true. Note, sgid is cleared only if
        ///   group executable bit is set.
        ///    o setattr has FATTR_SIZE and FATTR_KILL_SUIDGID set.
        ///    o setattr has FATTR_UID or FATTR_GID
        ///    o open has O_TRUNC and FUSE_OPEN_KILL_SUIDGID
        ///    o create has O_TRUNC and FUSE_OPEN_KILL_SUIDGID flag set.
        ///    o write has FUSE_WRITE_KILL_SUIDGID
        ///
        /// This feature is enabled by default if supported by the kernel.
        const HANDLE_KILLPRIV_V2 = HANDLE_KILLPRIV_V2;

        /// Server supports extended struct SetxattrIn
        const SETXATTR_EXT = SETXATTR_EXT;

        /// Indicates that fuse_init_in structure has been extended and
        /// expect extended struct coming in from kernel.
        const INIT_EXT = INIT_EXT;

        /// This bit is reserved. Don't use it.
        const INIT_RESERVED = INIT_RESERVED;

        /// Indicates that kernel is capable of sending a security
        /// context at file creation time (create, mkdir, symlink
        /// and mknod). This is expected to be a SELinux security
        /// context as of now.
        const SECURITY_CTX = SECURITY_CTX;

        /// Indicates that kernel is capable of understanding
        /// per inode dax flag sent in response to getattr
        /// request. This will allow server to enable dax on
        /// selective files.
        const HAS_INODE_DAX = HAS_INODE_DAX;

        /// Add supplementary groups info to create, mkdir, symlink
        /// and mknod (single group that matches parent).
        const CREATE_SUPP_GROUP = CREATE_SUPP_GROUP;

        /// Allow shared mmap'ing of files in DIRECT_IO.
        const DIRECT_IO_ALLOW_MMAP = DIRECT_IO_ALLOW_MMAP;

        /// Indicates if idmapped mounts are allowed for virtiofs.
        const ALLOW_IDMAP = ALLOW_IDMAP;
    }
}

// Release flags.
pub const RELEASE_FLUSH: u32 = 1 << 0;
pub const RELEASE_FLOCK_UNLOCK: u32 = 1 << 1;

// Getattr flags.
pub const GETATTR_FH: u32 = 1 << 0;

// Lock flags.
pub const LK_FLOCK: u32 = 1 << 0;

// Write flags.
/// Delayed write from page cache, file handle is guessed.
pub const WRITE_CACHE: u32 = 1 << 0;
/// `lock_owner` field is valid.
pub const WRITE_LOCKOWNER: u32 = 1 << 1;
/// Kill suid and sgid bits
pub const WRITE_KILL_PRIV: u32 = 1 << 2;

// Read flags.
pub const READ_LOCKOWNER: u32 = 1 << 1;

// Ioctl flags.
/// 32bit compat ioctl on 64bit machine
const IOCTL_COMPAT: u32 = 1 << 0;
/// Not restricted to well-formed ioctls, retry allowed
const IOCTL_UNRESTRICTED: u32 = 1 << 1;
/// Retry with new iovecs
const IOCTL_RETRY: u32 = 1 << 2;
/// 32bit ioctl
const IOCTL_32BIT: u32 = 1 << 3;
/// Is a directory
const IOCTL_DIR: u32 = 1 << 4;
/// x32 compat ioctl on 64bit machine (64bit time_t)
const IOCTL_COMPAT_X32: u32 = 1 << 5;
/// Maximum of in_iovecs + out_iovecs
const IOCTL_MAX_IOV: u32 = 256;

bitflags!
{
    /// Typed view of the `IOCTL_*` wire constants returned in `IoctlOut::flags`.
    pub struct IoctlFlags: u32 {
        /// 32bit compat ioctl on 64bit machine
        const IOCTL_COMPAT = IOCTL_COMPAT;
        /// Not restricted to well-formed ioctls, retry allowed
        const IOCTL_UNRESTRICTED = IOCTL_UNRESTRICTED;
        /// Retry with new iovecs
        const IOCTL_RETRY = IOCTL_RETRY;
        /// 32bit ioctl
        const IOCTL_32BIT = IOCTL_32BIT;
        /// Is a directory
        const IOCTL_DIR = IOCTL_DIR;
        /// x32 compat ioctl on 64bit machine (64bit time_t)
        const IOCTL_COMPAT_X32 = IOCTL_COMPAT_X32;
        /// Maximum of in_iovecs + out_iovecs
        const IOCTL_MAX_IOV = IOCTL_MAX_IOV;
    }
}

/// Request poll notify.
pub const POLL_SCHEDULE_NOTIFY: u32 = 1 << 0;

/// The read buffer is required to be at least 8k, but may be much larger.
pub const FUSE_MIN_READ_BUFFER: u32 = 8192;

// Sizes of the compat (pre-extension) variants of several wire structs, used
// when talking to clients that negotiated an older protocol minor version.
pub const FUSE_COMPAT_ENTRY_OUT_SIZE: u32 = 120;
pub const FUSE_COMPAT_ATTR_OUT_SIZE: u32 = 96;
pub const FUSE_COMPAT_MKNOD_IN_SIZE: u32 = 8;
pub const FUSE_COMPAT_WRITE_IN_SIZE: u32 = 24;
pub const FUSE_COMPAT_STATFS_SIZE: u32 = 48;
pub const FUSE_COMPAT_INIT_OUT_SIZE: u32 = 8;
pub const FUSE_COMPAT_22_INIT_OUT_SIZE: u32 = 24;
pub const FUSE_COMPAT_SETXATTR_IN_SIZE: u32 = 8;

// Fsync flags
pub const FSYNC_FDATASYNC: u32 = 1 << 0;

// Attr.flags flags.
/// Object is a submount root
pub const ATTR_SUBMOUNT: u32 = 1 << 0;
/// Indicate to kernel to enable DAX for this file in per inode DAX mode
pub const ATTR_DAX: u32 = 1 << 1;

// Open flags
/// Kill suid and sgid if executable
pub const OPEN_KILL_SUIDGID: u32 = 1 << 0;

// setxattr flags
/// Clear SGID when system.posix_acl_access is set
const SETXATTR_ACL_KILL_SGID: u32 = 1 << 0;

bitflags! {
    /// Typed view of the setxattr request flags.
    pub struct SetxattrFlags: u32 {
        /// Clear SGID when system.posix_acl_access is set
        const SETXATTR_ACL_KILL_SGID = SETXATTR_ACL_KILL_SGID;
    }
}

// Message definitions follow. It is safe to implement ByteValued for all of these
// because they are POD types.
/// Wire representation of a file's attributes (mirror of `fuse_attr`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Attr {
    pub ino: u64,
    pub size: u64,
    pub blocks: u64,
    pub atime: u64,
    pub mtime: u64,
    pub ctime: u64,
    pub atimensec: u32,
    pub mtimensec: u32,
    pub ctimensec: u32,
    pub mode: u32,
    pub nlink: u32,
    pub uid: GuestUid,
    pub gid: GuestGid,
    pub rdev: u32,
    pub blksize: u32,
    pub flags: u32,
}
unsafe impl ByteValued for Attr {}

// NOTE(review): the generic-argument lists in the signatures below (e.g. the
// argument of `io::Result`) appear to have been stripped during extraction of
// this file — verify against the upstream source before building.
impl Attr {
    /**
     * Turn a `stat64` result into its FUSE representation (`Attr`).
     *
     * Maps UID and GID from host to guest according to the mappings provided.
     */
    pub fn try_from_stat64<
        UidMap: FnOnce(HostUid) -> io::Result,
        GidMap: FnOnce(HostGid) -> io::Result,
    >(
        st: libc::stat64,
        uid_map: UidMap,
        gid_map: GidMap,
    ) -> io::Result {
        // Delegate to the flag-aware constructor with no attr flags set.
        Attr::try_with_flags(st, 0, uid_map, gid_map)
    }

    /**
     * Turn a `stat64` result into its FUSE representation (`Attr`), including `flags`.
     *
     * Maps UID and GID from host to guest according to the mappings provided.
     */
    pub fn try_with_flags<
        UidMap: FnOnce(HostUid) -> io::Result,
        GidMap: FnOnce(HostGid) -> io::Result,
    >(
        st: libc::stat64,
        flags: u32,
        uid_map: UidMap,
        gid_map: GidMap,
    ) -> io::Result {
        // Translate ownership first so a mapping failure aborts the conversion.
        let uid = uid_map(HostUid::from(st.st_uid))?;
        let gid = gid_map(HostGid::from(st.st_gid))?;
        Ok(Attr {
            ino: st.st_ino,
            size: st.st_size as u64,
            blocks: st.st_blocks as u64,
            atime: st.st_atime as u64,
            mtime: st.st_mtime as u64,
            ctime: st.st_ctime as u64,
            atimensec: st.st_atime_nsec as u32,
            mtimensec: st.st_mtime_nsec as u32,
            ctimensec: st.st_ctime_nsec as u32,
            mode: st.st_mode,
            nlink: st.st_nlink as u32,
            uid,
            gid,
            rdev: st.st_rdev as u32,
            blksize: st.st_blksize as u32,
            flags,
        })
    }
}

/// Wire representation of filesystem statistics (mirror of `fuse_kstatfs`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Kstatfs {
    pub blocks: u64,
    pub bfree: u64,
    pub bavail: u64,
    pub files: u64,
    pub ffree: u64,
    pub bsize: u32,
    pub namelen: u32,
    pub frsize: u32,
    pub padding: u32,
    pub spare: [u32; 6],
}
unsafe impl ByteValued for Kstatfs {}

// NOTE(review): the type argument of this `From` impl (presumably
// `libc::statvfs64`, per the `from` signature) was stripped during extraction.
impl From for Kstatfs {
    fn from(st: libc::statvfs64) -> Self {
        Kstatfs {
            blocks: st.f_blocks,
            bfree: st.f_bfree,
            bavail: st.f_bavail,
            files: st.f_files,
            ffree: st.f_ffree,
            bsize: st.f_bsize as u32,
            namelen: st.f_namemax as u32,
            frsize: st.f_frsize as u32,
            // padding/spare stay zeroed.
            ..Default::default()
        }
    }
}

/// Wire representation of a POSIX/BSD file lock (mirror of `fuse_file_lock`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct FileLock {
    pub start: u64,
    pub end: u64,
    pub type_: u32,
    pub pid: u32, /* tgid */
}
unsafe impl ByteValued for FileLock {}

enum_value! {
    /// FUSE request opcodes, as found in `InHeader::opcode`.
    #[derive(Debug, Copy, Clone)]
    pub enum Opcode: u32 {
        Lookup = 1,
        Forget = 2, /* No Reply */
        Getattr = 3,
        Setattr = 4,
        Readlink = 5,
        Symlink = 6,
        Mknod = 8,
        Mkdir = 9,
        Unlink = 10,
        Rmdir = 11,
        Rename = 12,
        Link = 13,
        Open = 14,
        Read = 15,
        Write = 16,
        Statfs = 17,
        Release = 18,
        Fsync = 20,
        Setxattr = 21,
        Getxattr = 22,
        Listxattr = 23,
        Removexattr = 24,
        Flush = 25,
        Init = 26,
        Opendir = 27,
        Readdir = 28,
        Releasedir = 29,
        Fsyncdir = 30,
        Getlk = 31,
        Setlk = 32,
        Setlkw = 33,
        Access = 34,
        Create = 35,
        Interrupt = 36,
        Bmap = 37,
        Destroy = 38,
        Ioctl = 39,
        Poll = 40,
        NotifyReply = 41,
        BatchForget = 42,
        Fallocate = 43,
        Readdirplus = 44,
        Rename2 = 45,
        Lseek = 46,
        CopyFileRange = 47,
        SetupMapping = 48,
        RemoveMapping = 49,
        Syncfs = 50,
        TmpFile = 51,
    }
}

/// Opcodes for server-initiated notifications sent to the kernel.
#[repr(u32)]
#[derive(Debug, Copy, Clone)]
pub enum NotifyOpcode {
    Poll = 1,
    InvalInode = 2,
    InvalEntry = 3,
    Store = 4,
    Retrieve = 5,
    Delete = 6,
    CodeMax = 7,
}

/// Reply body for LOOKUP/CREATE-style requests (mirror of `fuse_entry_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct EntryOut {
    pub nodeid: u64,      /* Inode ID */
    pub generation: u64,  /* Inode generation: nodeid:gen must be unique for the fs's lifetime */
    pub entry_valid: u64, /* Cache timeout for the name */
    pub attr_valid: u64,  /* Cache timeout for the attributes */
    pub entry_valid_nsec: u32,
    pub attr_valid_nsec: u32,
    pub attr: Attr,
}
unsafe impl ByteValued for EntryOut {}

/// Request body for FORGET (mirror of `fuse_forget_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct ForgetIn {
    pub nlookup: u64,
}
unsafe impl ByteValued for ForgetIn {}

/// One entry of a BATCH_FORGET request (mirror of `fuse_forget_one`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct ForgetOne {
    pub nodeid: u64,
    pub nlookup: u64,
}
unsafe impl ByteValued for ForgetOne {}

#[repr(C)]
/// Header of a BATCH_FORGET request; followed by `count` `ForgetOne` entries.
#[derive(Debug, Default, Copy, Clone)]
pub struct BatchForgetIn {
    pub count: u32,
    pub dummy: u32,
}
unsafe impl ByteValued for BatchForgetIn {}

/// Request body for GETATTR (mirror of `fuse_getattr_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct GetattrIn {
    pub flags: u32, // GETATTR_* bits; GETATTR_FH means `fh` is valid
    pub dummy: u32,
    pub fh: u64,
}
unsafe impl ByteValued for GetattrIn {}

/// Reply body for GETATTR/SETATTR (mirror of `fuse_attr_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct AttrOut {
    pub attr_valid: u64, /* Cache timeout for the attributes */
    pub attr_valid_nsec: u32,
    pub dummy: u32,
    pub attr: Attr,
}
unsafe impl ByteValued for AttrOut {}

/// Request body for MKNOD (mirror of `fuse_mknod_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct MknodIn {
    pub mode: u32,
    pub rdev: u32,
    pub umask: u32,
    pub padding: u32,
}
unsafe impl ByteValued for MknodIn {}

/// Request body for MKDIR (mirror of `fuse_mkdir_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct MkdirIn {
    pub mode: u32,
    pub umask: u32,
}
unsafe impl ByteValued for MkdirIn {}

/// Request body for RENAME (mirror of `fuse_rename_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct RenameIn {
    pub newdir: u64,
}
unsafe impl ByteValued for RenameIn {}

/// Request body for RENAME2 (mirror of `fuse_rename2_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Rename2In {
    pub newdir: u64,
    pub flags: u32,
    pub padding: u32,
}
unsafe impl ByteValued for Rename2In {}

/// Request body for LINK (mirror of `fuse_link_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct LinkIn {
    pub oldnodeid: u64,
}
unsafe impl ByteValued for LinkIn {}

/// Request body for SETATTR (mirror of `fuse_setattr_in`); `valid` holds
/// `FATTR_*` bits declaring which fields are meaningful.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SetattrIn {
    pub valid: u32,
    pub padding: u32,
    pub fh: u64,
    pub size: u64,
    pub lock_owner: u64,
    pub atime: u64,
    pub mtime: u64,
    pub ctime: u64,
    pub atimensec: u32,
    pub mtimensec: u32,
    pub ctimensec: u32,
    pub mode: u32,
    pub unused4: u32,
    pub uid: GuestUid,
    pub gid: GuestGid,
    pub unused5: u32,
}
unsafe impl ByteValued for SetattrIn {}

/// Request body for OPEN/OPENDIR (mirror of `fuse_open_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct OpenIn {
    pub flags: u32,      // O_* flags from the client
    pub open_flags: u32, // OPEN_* bits (e.g. OPEN_KILL_SUIDGID)
}
unsafe impl ByteValued for OpenIn {}

/// Request body for CREATE (mirror of `fuse_create_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct CreateIn {
    pub flags: u32,
    pub mode: u32,
    pub umask: u32,
    pub open_flags: u32,
}
unsafe impl ByteValued for CreateIn {}

/// Reply body for OPEN/OPENDIR/CREATE (mirror of `fuse_open_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct OpenOut {
    pub fh: u64,
    pub open_flags: u32, // FOPEN_* bits (see `OpenOptions`)
    pub padding: u32,
}
unsafe impl ByteValued for OpenOut {}

/// Request body for RELEASE/RELEASEDIR (mirror of `fuse_release_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct ReleaseIn {
    pub fh: u64,
    pub flags: u32,
    pub release_flags: u32, // RELEASE_* bits
    pub lock_owner: u64,
}
unsafe impl ByteValued for ReleaseIn {}

/// Request body for FLUSH (mirror of `fuse_flush_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct FlushIn {
    pub fh: u64,
    pub unused: u32,
    pub padding: u32,
    pub lock_owner: u64,
}
unsafe impl ByteValued for FlushIn {}

/// Request body for READ/READDIR (mirror of `fuse_read_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct ReadIn {
    pub fh: u64,
    pub offset: u64,
    pub size: u32,
    pub read_flags: u32, // READ_* bits
    pub lock_owner: u64,
    pub flags: u32,
    pub padding: u32,
}
unsafe impl ByteValued for ReadIn {}

/// Request body for WRITE (mirror of `fuse_write_in`); payload follows.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct WriteIn {
    pub fh: u64,
    pub offset: u64,
    pub size: u32,
    pub write_flags: u32, // WRITE_* bits
    pub lock_owner: u64,
    pub flags: u32,
    pub padding: u32,
}
unsafe impl ByteValued for WriteIn {}

/// Reply body for WRITE (mirror of `fuse_write_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct WriteOut {
    pub size: u32,
    pub padding: u32,
}
unsafe impl ByteValued for WriteOut {}

/// Reply body for STATFS (mirror of `fuse_statfs_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct StatfsOut {
    pub st: Kstatfs,
}
unsafe impl ByteValued for StatfsOut {}

/// Request body for FSYNC/FSYNCDIR (mirror of `fuse_fsync_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct FsyncIn {
    pub fh: u64,
    pub fsync_flags: u32, // FSYNC_FDATASYNC when only data must be flushed
    pub padding: u32,
}
unsafe impl ByteValued for FsyncIn {}

/// Extended request body for SETXATTR (requires SETXATTR_EXT).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SetxattrIn {
    pub size: u32,
    pub flags: u32,
    pub setxattr_flags: u32, // SETXATTR_* bits
    pub padding: u32,
}
unsafe impl ByteValued for SetxattrIn {}

/// Pre-extension request body for SETXATTR.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SetxattrInCompat {
    pub size: u32,
    pub flags: u32,
}
unsafe impl ByteValued for SetxattrInCompat {}

/// Request body for GETXATTR/LISTXATTR (mirror of `fuse_getxattr_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct GetxattrIn {
    pub size: u32,
    pub padding: u32,
}
unsafe impl ByteValued for GetxattrIn {}

/// Reply body for GETXATTR/LISTXATTR size queries (mirror of `fuse_getxattr_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct GetxattrOut {
    pub size: u32,
    pub padding: u32,
}
unsafe impl ByteValued for GetxattrOut {}

/// Request body for GETLK/SETLK/SETLKW (mirror of `fuse_lk_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct LkIn {
    pub fh: u64,
    pub owner: u64,
    pub lk: FileLock,
    pub lk_flags: u32, // LK_* bits
    pub padding: u32,
}
unsafe impl ByteValued for LkIn {}

/// Reply body for GETLK (mirror of `fuse_lk_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct LkOut {
    pub lk: FileLock,
}
unsafe impl ByteValued for LkOut {}

/// Request body for ACCESS (mirror of `fuse_access_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct AccessIn {
    pub mask: u32,
    pub padding: u32,
}
unsafe impl ByteValued for AccessIn {}

/// Pre-extension request body for INIT (mirror of `fuse_init_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct InitInCompat {
    pub major: u32,
    pub minor: u32,
    pub max_readahead: u32,
    pub flags: u32,
}
unsafe impl ByteValued for InitInCompat {}

/// Extension of the INIT request body (requires INIT_EXT).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct InitInExt {
    pub flags2: u32, // upper 32 bits of the negotiated flags
    pub unused: [u32; 11],
}
unsafe impl ByteValued for InitInExt {}

/// Reply body for INIT (mirror of `fuse_init_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct InitOut {
    pub major: u32,
    pub minor: u32,
    pub max_readahead: u32,
    pub flags: u32,
    pub max_background: u16,
    pub congestion_threshold: u16,
    pub max_write: u32,
    pub time_gran: u32,
    pub max_pages: u16,
    pub map_alignment: u16,
    pub flags2: u32,
    pub unused: [u32; 7],
}
unsafe impl ByteValued for InitOut {}

/// Request body for INTERRUPT (mirror of `fuse_interrupt_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct InterruptIn {
    pub unique: u64, // `unique` of the request to interrupt
}
unsafe impl ByteValued for InterruptIn {}

/// Request body for BMAP (mirror of `fuse_bmap_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct BmapIn {
    pub block: u64,
    pub blocksize: u32,
    pub padding: u32,
}
unsafe impl ByteValued for BmapIn {}

/// Reply body for BMAP (mirror of `fuse_bmap_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct BmapOut {
    pub block: u64,
}
unsafe impl ByteValued for BmapOut {}

/// Request body for IOCTL (mirror of `fuse_ioctl_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct IoctlIn {
    pub fh: u64,
    pub flags: u32, // IOCTL_* bits
    pub cmd: u32,
    pub arg: u64,
    pub in_size: u32,
    pub out_size: u32,
}
unsafe impl ByteValued for IoctlIn {}

/// One iovec entry used in unrestricted-ioctl retries (mirror of `fuse_ioctl_iovec`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct IoctlIovec {
    pub base: u64,
    pub len: u64,
}
unsafe impl ByteValued for IoctlIovec {}

/// Reply body for IOCTL (mirror of `fuse_ioctl_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct IoctlOut {
    pub result: i32,
    pub flags: u32, // IOCTL_* bits (e.g. IOCTL_RETRY)
    pub in_iovs: u32,
    pub out_iovs: u32,
}
unsafe impl ByteValued for IoctlOut {}

/// Request body for POLL (mirror of `fuse_poll_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct PollIn {
    pub fh: u64,
    pub kh: u64, // kernel poll handle, echoed back in poll notifications
    pub flags: u32,
    pub events: u32,
}
unsafe impl ByteValued for PollIn {}

/// Reply body for POLL (mirror of `fuse_poll_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct PollOut {
    pub revents: u32,
    pub padding: u32,
}
unsafe impl ByteValued for PollOut {}

/// Body of a poll-wakeup notification (mirror of `fuse_notify_poll_wakeup_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct NotifyPollWakeupOut {
    pub kh: u64,
}
unsafe impl ByteValued for NotifyPollWakeupOut {}

/// Request body for FALLOCATE (mirror of `fuse_fallocate_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct FallocateIn {
    pub fh: u64,
    pub offset: u64,
    pub length: u64,
    pub mode: u32,
    pub padding: u32,
}
unsafe impl ByteValued for FallocateIn {}

/// Header preceding every FUSE request (mirror of `fuse_in_header`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct InHeader {
    pub len: u32,
    pub opcode: u32, // see `Opcode`
    pub unique: u64,
    pub nodeid: u64,
    pub uid: GuestUid,
    pub gid: GuestGid,
    pub pid: u32,
    pub total_extlen: u16, // length of extensions in 8-byte units
    pub padding: u16,
}
unsafe impl ByteValued for InHeader {}

/// Header preceding every FUSE reply (mirror of `fuse_out_header`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct OutHeader {
    pub len: u32,
    pub error: i32, // negative errno, or 0 on success
    pub unique: u64,
}
unsafe impl ByteValued for OutHeader {}

/// One READDIR entry header; the name bytes follow it (mirror of `fuse_dirent`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Dirent {
    pub ino: u64,
    pub off: u64,
    pub namelen: u32,
    pub type_: u32,
    // char name[];
}
unsafe impl ByteValued for Dirent {}

/// One READDIRPLUS entry: a full `EntryOut` plus the dirent header.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Direntplus {
    pub entry_out: EntryOut,
    pub dirent: Dirent,
}
unsafe impl ByteValued for Direntplus {}

/// Body of an inode-invalidation notification (mirror of `fuse_notify_inval_inode_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct NotifyInvalInodeOut {
    pub ino: u64,
    pub off: i64,
    pub len: i64,
}
unsafe impl ByteValued for NotifyInvalInodeOut {}

/// Only expire the entry; don't forcibly invalidate it.
const FUSE_EXPIRE_ONLY: u32 = 1 << 0;

bitflags!
{
    /// Flags for entry-invalidation notifications.
    pub struct NotifyInvalEntryOutFlags: u32 {
        const EXPIRE_ONLY = FUSE_EXPIRE_ONLY;
    }
}

/// Body of an entry-invalidation notification; the name follows it.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct NotifyInvalEntryOut {
    pub parent: u64,
    pub namelen: u32,
    pub flags: u32, // see `NotifyInvalEntryOutFlags`
}
unsafe impl ByteValued for NotifyInvalEntryOut {}

/// Body of a delete notification; the name follows it.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct NotifyDeleteOut {
    pub parent: u64,
    pub child: u64,
    pub namelen: u32,
    pub padding: u32,
}
unsafe impl ByteValued for NotifyDeleteOut {}

/// Body of a store notification; the data to cache follows it.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct NotifyStoreOut {
    pub nodeid: u64,
    pub offset: u64,
    pub size: u32,
    pub padding: u32,
}
unsafe impl ByteValued for NotifyStoreOut {}

/// Body of a retrieve notification.
// NOTE(review): the snake_case name diverges from the CamelCase used by the
// other structs here; renaming would break external references, so it is kept.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Notify_Retrieve_Out {
    pub notify_unique: u64,
    pub nodeid: u64,
    pub offset: u64,
    pub size: u32,
    pub padding: u32,
}
unsafe impl ByteValued for Notify_Retrieve_Out {}

/* Matches the size of fuse_write_in */
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct NotifyRetrieveIn {
    pub dummy1: u64,
    pub offset: u64,
    pub size: u32,
    pub dummy2: u32,
    pub dummy3: u64,
    pub dummy4: u64,
}
unsafe impl ByteValued for NotifyRetrieveIn {}

/// Request body for LSEEK (mirror of `fuse_lseek_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct LseekIn {
    pub fh: u64,
    pub offset: u64,
    pub whence: u32,
    pub padding: u32,
}
unsafe impl ByteValued for LseekIn {}

/// Reply body for LSEEK (mirror of `fuse_lseek_out`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct LseekOut {
    pub offset: u64,
}
unsafe impl ByteValued for LseekOut {}

/// Request body for COPY_FILE_RANGE (mirror of `fuse_copy_file_range_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct CopyfilerangeIn {
    pub fh_in: u64,
    pub off_in: u64,
    pub nodeid_out: u64,
    pub fh_out: u64,
    pub off_out: u64,
    pub len: u64,
    pub flags: u64,
}
unsafe impl ByteValued for CopyfilerangeIn {}

// DAX mapping request flags.
const SETUPMAPPING_FLAG_WRITE: u64 = 1 << 0;
const SETUPMAPPING_FLAG_READ: u64 = 1 << 1;

bitflags! {
    /// Access mode requested for a DAX SETUPMAPPING.
    pub struct SetupmappingFlags: u64 {
        const WRITE = SETUPMAPPING_FLAG_WRITE;
        const READ = SETUPMAPPING_FLAG_READ;
    }
}

/// Request body for SETUPMAPPING (mirror of `fuse_setupmapping_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SetupmappingIn {
    pub fh: u64,
    pub foffset: u64,
    pub len: u64,
    pub flags: u64, // see `SetupmappingFlags`
    pub moffset: u64,
}
unsafe impl ByteValued for SetupmappingIn {}

/// Header of a REMOVEMAPPING request; followed by `count` `RemovemappingOne` entries.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct RemovemappingIn {
    pub count: u32,
}
unsafe impl ByteValued for RemovemappingIn {}

/// One entry of a REMOVEMAPPING request (mirror of `fuse_removemapping_one`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct RemovemappingOne {
    pub moffset: u64,
    pub len: u64,
}
unsafe impl ByteValued for RemovemappingOne {}

/// Request body for SYNCFS (mirror of `fuse_syncfs_in`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SyncfsIn {
    pub padding: u64,
}
unsafe impl ByteValued for SyncfsIn {}

/// Extension header
/// `size`: total size of this extension including this header
/// `ext_type`: type of extension
/// This is made compatible with `SecctxHeader` by using type values > `FUSE_MAX_NR_SECCTX`
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct ExtHeader {
    pub size: u32,
    pub ext_type: u32,
}

/// Extension types
/// Types `0..MAX_NR_SECCTX` are reserved for `SecCtx` extension for backward compatibility.
const MAX_NR_SECCTX: u32 = 31; // Maximum value of `SecctxHeader::nr_secctx`
const EXT_SUP_GROUPS: u32 = 32;

unsafe impl ByteValued for ExtHeader {}

/// Extension type
#[derive(Debug, Copy, Clone)]
pub enum ExtType {
    /// Security contexts
    SecCtx(u32),
    /// Supplementary groups
    SupGroups,
}

// NOTE(review): the generic arguments of this `TryFrom` impl and its `Result`
// return types appear to have been stripped during extraction — verify
// against the upstream source before building.
impl TryFrom for ExtType {
    type Error = ();

    fn try_from(value: u32) -> Result {
        match value {
            // Values up to MAX_NR_SECCTX keep backward compatibility with SecctxHeader.
            v if v <= MAX_NR_SECCTX => Ok(Self::SecCtx(value)),
            v if v == EXT_SUP_GROUPS => Ok(Self::SupGroups),
            _ => Err(()),
        }
    }
}

/// For each security context, send `Secctx` with size of security context
/// `Secctx` will be followed by security context name and this in turn
/// will be followed by actual context label.
/// `Secctx`, name, context
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Secctx {
    pub size: u32,
    pub padding: u32,
}
unsafe impl ByteValued for Secctx {}

/// Contains the information about how many `Secctx` structures are being
/// sent and what's the total size of all security contexts (including
/// size of `SecctxHeader`).
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SecctxHeader {
    pub size: u32,
    pub nr_secctx: u32,
}
unsafe impl ByteValued for SecctxHeader {}

/// Supplementary groups extension
/// `nr_groups`: number of supplementary groups
/// `groups`: flexible array of group IDs
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SuppGroups {
    pub nr_groups: u32,
    // groups: [GuestGid];
}
unsafe impl ByteValued for SuppGroups {}

// ---- tar archive member boundary (crate packaging residue, left verbatim):
// src/idmap.rs begins after the ustar header below. ----
virtiofsd-1.13.0/src/idmap.rs000064400000000000000000000063511046102023000141530ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause

use std::fmt;
use std::num::ParseIntError;
use std::str::FromStr;

/// Expected error conditions with respect to parsing both UidMap and GidMap
#[derive(Debug, Eq, PartialEq)]
pub enum IdMapError {
    /// A delimiter has been found that does not match the delimiter the map started with.
    InvalidDelimiter,
    /// The map is empty or incorrect number of values are provided.
    IncompleteMap,
    /// Wraps the cause of parsing an integer failing.
InvalidValue(ParseIntError), } impl std::error::Error for IdMapError {} impl fmt::Display for IdMapError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { IdMapError::InvalidDelimiter => write!( f, "A delimiter has been found that does not match the delimiter the map started with" ), IdMapError::IncompleteMap => write!( f, "The map is empty or incorrect number of values are provided" ), IdMapError::InvalidValue(err) => write!(f, "{}", err), } } } impl From for IdMapError { fn from(err: ParseIntError) -> Self { IdMapError::InvalidValue(err) } } #[derive(Clone, Debug, PartialEq, Eq)] pub struct UidMap { pub inside_uid: u32, pub outside_uid: u32, pub count: u32, } impl FromStr for UidMap { type Err = IdMapError; fn from_str(s: &str) -> Result { let fields = parse_idmap(s, 3)?; Ok(UidMap { inside_uid: fields[0], outside_uid: fields[1], count: fields[2], }) } } impl fmt::Display for UidMap { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, ":{}:{}:{}:", self.inside_uid, self.outside_uid, self.count ) } } #[derive(Clone, Debug, PartialEq, Eq)] pub struct GidMap { pub inside_gid: u32, pub outside_gid: u32, pub count: u32, } impl FromStr for GidMap { type Err = IdMapError; fn from_str(s: &str) -> Result { let fields = parse_idmap(s, 3)?; Ok(GidMap { inside_gid: fields[0], outside_gid: fields[1], count: fields[2], }) } } impl fmt::Display for GidMap { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, ":{}:{}:{}:", self.inside_gid, self.outside_gid, self.count ) } } fn parse_idmap(s: &str, expected_len: usize) -> Result, IdMapError> { let mut s = String::from(s); let delimiter = s.pop().ok_or(IdMapError::IncompleteMap)?; if delimiter.is_alphanumeric() { return Err(IdMapError::InvalidDelimiter); } let values: Vec<&str> = s .strip_prefix(delimiter) .ok_or(IdMapError::InvalidDelimiter)? 
.split(delimiter) .collect(); if values.len() != expected_len { return Err(IdMapError::IncompleteMap); } values .into_iter() .map(|v| v.parse().map_err(IdMapError::InvalidValue)) .collect() } #[derive(Debug, Eq, PartialEq)] #[repr(u8)] pub(crate) enum IdMapSetUpPipeMessage { Request = 0x1, Done = 0x2, } virtiofsd-1.13.0/src/lib.rs000064400000000000000000000046641046102023000136340ustar 00000000000000// Copyright © 2019 Intel Corporation // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause #[macro_use] extern crate log; pub mod descriptor_utils; pub mod file_traits; pub mod filesystem; pub mod fuse; pub mod idmap; pub mod limits; pub mod macros; pub mod oslib; pub mod passthrough; pub mod read_dir; pub mod sandbox; #[cfg(feature = "seccomp")] pub mod seccomp; pub mod server; pub mod soft_idmap; pub mod util; pub mod vhost_user; use std::ffi::{FromBytesWithNulError, FromVecWithNulError}; use std::{error, fmt, io}; #[derive(Debug)] pub enum Error { /// Failed to decode protocol messages. DecodeMessage(io::Error), /// Failed to encode protocol messages. EncodeMessage(io::Error), /// Failed to flush protocol messages. FlushMessage(io::Error), /// One or more parameters are missing. MissingParameter, /// A C string parameter is invalid. InvalidCString(FromBytesWithNulError), /// A C string parameter is invalid. InvalidCString2(FromVecWithNulError), /// The `len` field of the header is too small. InvalidHeaderLength, /// The `size` field of the `SetxattrIn` message does not match the length /// of the decoded value. InvalidXattrSize((u32, usize)), /// One or more extensions are missing. 
MissingExtension, } impl error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use Error::*; match self { DecodeMessage(err) => write!(f, "failed to decode fuse message: {err}"), EncodeMessage(err) => write!(f, "failed to encode fuse message: {err}"), FlushMessage(err) => write!(f, "failed to flush fuse message: {err}"), MissingParameter => write!(f, "one or more parameters are missing"), InvalidHeaderLength => write!(f, "the `len` field of the header is too small"), InvalidCString(err) => write!(f, "a c string parameter is invalid: {err}"), InvalidCString2(err) => write!(f, "a c string parameter is invalid: {err}"), InvalidXattrSize((size, len)) => write!( f, "The `size` field of the `SetxattrIn` message does not match the length of the\ decoded value: size = {size}, value.len() = {len}" ), MissingExtension => write!(f, "one or more extensions are missing"), } } } pub type Result = ::std::result::Result; virtiofsd-1.13.0/src/limits.rs000064400000000000000000000052041046102023000143560ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause use libc::{getrlimit, rlim_t, rlimit, setrlimit, RLIMIT_NOFILE}; use std::mem::MaybeUninit; use std::{cmp, fs, io}; // Default number of open files (RLIMIT_NOFILE) const DEFAULT_NOFILE: rlim_t = 1_000_000; /// Gets the maximum number of open files. fn get_max_nofile() -> Result { let path = "/proc/sys/fs/nr_open"; let max_str = fs::read_to_string(path).map_err(|error| format!("Reading {path}: {error:?}"))?; max_str .trim() .parse() .map_err(|error| format!("Parsing {path}: {error:?}")) } /// Gets the hard limit of open files. fn get_nofile_limits() -> Result { let mut limits = MaybeUninit::::zeroed(); let ret = unsafe { getrlimit(RLIMIT_NOFILE, limits.as_mut_ptr()) }; if ret != 0 { return Err(format!("getrlimit: {}", io::Error::last_os_error())); } Ok(unsafe { limits.assume_init() }) } /// Sets the limit of open files to the given value. 
fn setup_rlimit_nofile_to(nofile: rlim_t) -> Result<(), String> { let rlimit = rlimit { rlim_cur: nofile, rlim_max: nofile, }; let ret = unsafe { setrlimit(RLIMIT_NOFILE, &rlimit) }; if ret < 0 { Err(format!( "Failed to increase the limit: {:?}", io::Error::last_os_error() )) } else { Ok(()) } } pub fn setup_rlimit_nofile(nofile: Option) -> Result<(), String> { let max_nofile = get_max_nofile()?; let rlimit { rlim_cur, rlim_max } = get_nofile_limits()?; let target_limit = if let Some(nofile) = nofile { if nofile == 0 { return Ok(()); // '--rlimit-nofile=0' leaves the resource limit unchanged } nofile } else { if DEFAULT_NOFILE <= rlim_cur { return Ok(()); // the user has already setup the soft limit higher than the target } cmp::min(DEFAULT_NOFILE, max_nofile) }; if target_limit > max_nofile { return Err(format!("It cannot be increased above {max_nofile}")); } if let Err(error) = setup_rlimit_nofile_to(target_limit) { if nofile.is_some() { // Error attempting to setup user-supplied value return Err(error); } else { warn!( "Failure when trying to set the limit to {}, \ the hard limit ({}) of open file descriptors is used instead.", target_limit, rlim_max ); setup_rlimit_nofile_to(rlim_max).map_err(|error| { format!("Cannot increase the soft limit to the hard limit: {error}") })? } } Ok(()) } virtiofsd-1.13.0/src/macros.rs000064400000000000000000000016011046102023000143360ustar 00000000000000// Copyright 2022 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. macro_rules! 
enum_value { ( $(#[$meta:meta])* $vis:vis enum $enum:ident: $T:tt { $( $(#[$variant_meta:meta])* $variant:ident $(= $val:expr)?, )* } ) => { #[repr($T)] $(#[$meta])* $vis enum $enum { $($(#[$variant_meta])* $variant $(= $val)?,)* } impl std::convert::TryFrom<$T> for $enum { type Error = (); fn try_from(v: $T) -> Result { match v { $(v if v == $enum::$variant as $T => Ok($enum::$variant),)* _ => Err(()), } } } } } pub(crate) use enum_value; virtiofsd-1.13.0/src/main.rs000064400000000000000000001026351046102023000140070ustar 00000000000000// Copyright 2019 Intel Corporation. All Rights Reserved. // // SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) use log::*; use passthrough::xattrmap::XattrMap; use std::collections::HashSet; use std::convert::TryFrom; use std::ffi::CString; use std::os::unix::io::{FromRawFd, RawFd}; use std::path::Path; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use std::{env, process}; use virtiofsd::idmap::{GidMap, UidMap}; use clap::{CommandFactory, Parser}; use vhost::vhost_user::Error::Disconnected; use vhost::vhost_user::Listener; use vhost_user_backend::Error::HandleRequest; use vhost_user_backend::VhostUserDaemon; use virtiofsd::filesystem::{FileSystem, SerializableFileSystem}; use virtiofsd::passthrough::read_only::PassthroughFsRo; use virtiofsd::passthrough::{ self, CachePolicy, InodeFileHandlesMode, MigrationMode, MigrationOnError, PassthroughFs, }; use virtiofsd::sandbox::{Sandbox, SandboxMode}; use virtiofsd::seccomp::{enable_seccomp, SeccompAction}; use virtiofsd::util::write_pid_file; use virtiofsd::vhost_user::{Error, VhostUserFsBackendBuilder, MAX_TAG_LEN}; use virtiofsd::{limits, oslib, soft_idmap}; use vm_memory::{GuestMemoryAtomic, GuestMemoryMmap}; type Result = std::result::Result; fn parse_seccomp(src: &str) -> std::result::Result { Ok(match src { "none" => SeccompAction::Allow, // i.e. 
no seccomp "kill" => SeccompAction::Kill, "log" => SeccompAction::Log, "trap" => SeccompAction::Trap, _ => return Err("Matching variant not found"), }) } /// On the command line, we want to allow aliases for `InodeFileHandlesMode` values. This enum has /// all values allowed on the command line, and with `From`/`Into`, it can be translated into the /// internally used `InodeFileHandlesMode` enum. #[derive(Debug, Copy, Clone, PartialEq, Eq)] enum InodeFileHandlesCommandLineMode { /// `InodeFileHandlesMode::Never` Never, /// Alias for `InodeFileHandlesMode::Prefer` Fallback, /// `InodeFileHandlesMode::Prefer` Prefer, /// `InodeFileHandlesMode::Mandatory` Mandatory, } impl From for InodeFileHandlesMode { fn from(clm: InodeFileHandlesCommandLineMode) -> Self { match clm { InodeFileHandlesCommandLineMode::Never => InodeFileHandlesMode::Never, InodeFileHandlesCommandLineMode::Fallback => InodeFileHandlesMode::Prefer, InodeFileHandlesCommandLineMode::Prefer => InodeFileHandlesMode::Prefer, InodeFileHandlesCommandLineMode::Mandatory => InodeFileHandlesMode::Mandatory, } } } impl FromStr for InodeFileHandlesCommandLineMode { type Err = &'static str; fn from_str(s: &str) -> std::result::Result { match s { "never" => Ok(InodeFileHandlesCommandLineMode::Never), "fallback" => Ok(InodeFileHandlesCommandLineMode::Fallback), "prefer" => Ok(InodeFileHandlesCommandLineMode::Prefer), "mandatory" => Ok(InodeFileHandlesCommandLineMode::Mandatory), _ => Err("invalid inode file handles mode"), } } } fn parse_tag(tag: &str) -> Result { if !tag.is_empty() && tag.len() <= MAX_TAG_LEN { Ok(tag.into()) } else { Err(Error::InvalidTag) } } #[derive(Clone, Debug, Parser)] #[command( name = "virtiofsd", about = "Launch a virtiofsd backend.", version, args_override_self = true, arg_required_else_help = true, override_usage = "virtiofsd --shared-dir --socket-path [OPTIONS]" )] struct Opt { /// Shared directory path #[arg(long, required_unless_present_any = &["compat_options", 
"print_capabilities"])] shared_dir: Option, /// The tag that the virtio device advertises /// /// Setting this option will enable advertising of /// VHOST_USER_PROTOCOL_F_CONFIG. However, the vhost-user frontend of your /// hypervisor may not negotiate this feature and (or) ignore this value. /// Notably, QEMU currently (as of 8.1) ignores the CONFIG feature. QEMU /// versions from 7.1 to 8.0 will crash while attempting to log a warning /// about not supporting the feature. #[arg(long, value_parser = parse_tag)] tag: Option, /// vhost-user socket path [deprecated] #[arg(long, required_unless_present_any = &["fd", "socket_path", "print_capabilities"])] socket: Option, /// vhost-user socket path #[arg(long = "socket-path", required_unless_present_any = &["fd", "socket", "print_capabilities"])] socket_path: Option, /// Name of group for the vhost-user socket #[arg(long = "socket-group", conflicts_with_all = &["fd", "print_capabilities"])] socket_group: Option, /// File descriptor for the listening (not yet connected) socket #[arg(long, required_unless_present_any = &["socket", "socket_path", "print_capabilities"], conflicts_with_all = &["socket_path", "socket"])] fd: Option, /// Maximum thread pool size. A value of "0" disables the pool #[arg(long, default_value = "0")] thread_pool_size: usize, /// Enable support for extended attributes #[arg(long)] xattr: bool, /// Enable support for posix ACLs (implies --xattr) #[arg(long)] posix_acl: bool, /// Add custom rules for translating extended attributes between host and guest /// (e.g. :map::user.virtiofs.:) #[arg(long, value_parser = |s: &_| XattrMap::try_from(s))] xattrmap: Option, /// Sandbox mechanism to isolate the daemon process (namespace, chroot, none) #[arg(long, default_value = "namespace")] sandbox: SandboxMode, /// Prevent the guest from making modifications to the filesystem. 
#[arg(long)] readonly: bool, /// Action to take when seccomp finds a not allowed syscall (none, kill, log, trap) #[arg(long, value_parser = parse_seccomp, default_value = "kill")] seccomp: SeccompAction, /// Tell the guest which directories are mount points [default] #[arg(long)] announce_submounts: bool, /// Do not tell the guest which directories are mount points #[arg(long, overrides_with("announce_submounts"))] no_announce_submounts: bool, /// When to use file handles to reference inodes instead of O_PATH file descriptors (never, /// prefer, mandatory) /// /// - never: Never use file handles, always use O_PATH file descriptors. /// /// - prefer: Attempt to generate file handles, but fall back to O_PATH file descriptors where /// the underlying filesystem does not support file handles. Useful when there are various /// different filesystems under the shared directory and some of them do not support file /// handles. ("fallback" is a deprecated alias for "prefer".) /// /// - mandatory: Always use file handles, never fall back to O_PATH file descriptors. /// /// Using file handles reduces the number of file descriptors virtiofsd keeps open, which is /// not only helpful with resources, but may also be important in cases where virtiofsd should /// only have file descriptors open for files that are open in the guest, e.g. to get around /// bad interactions with NFS's silly renaming. #[arg(long, require_equals = true, default_value = "never")] inode_file_handles: InodeFileHandlesCommandLineMode, /// The caching policy the file system should use (auto, always, never, metadata) #[arg(long, default_value = "auto")] cache: CachePolicy, /// When used with --cache={metadata, never} will allow shared files to be mmap'd. /// Regardless of the selected cache policy, this option should only be enabled /// when the file system has exclusive access to the directory. 
#[arg(long)] allow_mmap: bool, /// Disable support for READDIRPLUS operations #[arg(long)] no_readdirplus: bool, /// Enable writeback cache #[arg(long)] writeback: bool, /// Honor the O_DIRECT flag passed down by guest applications #[arg(long)] allow_direct_io: bool, /// Print vhost-user.json backend program capabilities and exit #[arg(long = "print-capabilities")] print_capabilities: bool, /// Modify the list of capabilities, e.g., --modcaps=+sys_admin:-chown #[arg(long)] modcaps: Option, /// Log level (error, warn, info, debug, trace, off) #[arg(long = "log-level", default_value = "info")] log_level: LevelFilter, /// Log to syslog [default: stderr] #[arg(long)] syslog: bool, /// Set maximum number of file descriptors (0 leaves rlimit unchanged) /// [default: min(1000000, '/proc/sys/fs/nr_open')] #[arg(long = "rlimit-nofile")] rlimit_nofile: Option, /// Options in a format compatible with the legacy implementation [deprecated] #[arg(short = 'o')] compat_options: Option>, /// Set log level to "debug" [deprecated] #[arg(short = 'd')] compat_debug: bool, /// Disable KILLPRIV V2 support [default] #[arg(long)] _no_killpriv_v2: bool, /// Enable KILLPRIV V2 support #[arg(long, overrides_with("_no_killpriv_v2"))] killpriv_v2: bool, /// Compatibility option that has no effect [deprecated] #[arg(short = 'f')] compat_foreground: bool, /// Enable security label support. Expects SELinux xattr on file creation /// from client and stores it in the newly created file. #[arg(long = "security-label")] security_label: bool, /// Map a range of UIDs from the host into the namespace, given as /// :namespace_uid:host_uid:count: /// /// As opposed to '--translate-uid', this mapping is not done by virtiofsd, but by the /// user namespace into which virtiofsd is placed via '--sandbox=namespace'. /// /// For example, :0:100000:65536: will map the 65536 host UIDs [100000, 165535] /// into the namespace as [0, 65535]. /// /// Provide this argument multiple times to map multiple UID ranges. 
#[arg(long)] uid_map: Vec, /// Map a range of GIDs from the host into the namespace, given as /// :namespace_gid:host_gid:count: /// /// As opposed to '--translate-gid', this mapping is not done by virtiofsd, but by the /// user namespace into which virtiofsd is placed via '--sandbox=namespace'. /// /// For example, :0:100000:65536: will map the 65536 host GIDs [100000, 165535] /// into the namespace as [0, 65535]. /// /// Provide this argument multiple times to map multiple GID ranges. #[arg(long)] gid_map: Vec, /// Describe how to translate UIDs between guest and host, given as /// ':::'. /// /// As opposed to '--uid-map', this mapping is done internally by virtiofsd, and does not /// require using a user namespace. /// /// 'type' describes how to do the mapping, and in which direction: /// /// - 'guest': 1:1 map a range of guest UIDs to host UIDs /// /// - 'host': 1:1 map a range of host UIDs to guest UIDs /// /// - 'squash-guest': n:1 map a range of guest UIDs all to a single host UID /// /// - 'squash-host': n:1 map a range of host UIDs all to a single guest UID /// /// - 'forbid-guest': Forbid guest UIDs in the given range: Return an error to the guest /// whenever it tries to create a file with such a UID or make a file have such a UID /// /// - 'map': bidirectionally 1:1 map between a range of guest UIDs and host UIDs; the /// order is: 'map:::' /// /// Provide this argument multiple times to map multiple UID ranges. /// /// Cannot be used together with --posix-acl; translating UIDs (or GIDs) in virtiofsd would /// break posix ACLs. #[arg(long, conflicts_with = "posix_acl")] translate_uid: Vec, /// Same as '--translate-uid', but for GIDs. #[arg(long, conflicts_with = "posix_acl")] translate_gid: Vec, /// Preserve O_NOATIME behavior, otherwise automatically clean up O_NOATIME flag to prevent /// potential permission errors when running in unprivileged mode (e.g., when accessing files /// without having ownership/capability to use O_NOATIME). 
#[arg(long = "preserve-noatime")] preserve_noatime: bool, /// Defines how to perform migration, i.e. how to represent the internal state to the /// destination, and how to obtain that representation. /// /// - find-paths: Obtain paths for all inodes indexed and opened by the guest, and transfer /// those paths to the destination. To get those paths, we try to read the symbolic links in /// /proc/self/fd first; if that does not work, we will fall back to iterating through the /// shared directory (exhaustive search), enumerating all paths within. /// /// - file-handles: Pass file handles. For this to work, source and destination instance must /// operate on exactly the same shared directory on the same filesystem (which may be a /// network filesystem, mounted on different hosts). The destination instance must have the /// capability to open file handles, i.e. CAP_DAC_READ_SEARCH -- generally, this requires /// running virtiofsd as root and use `--modcaps=+dac_read_search`. /// /// This parameter is ignored on the destination side. #[arg(long = "migration-mode", default_value = "find-paths")] migration_mode: MigrationMode, /// Controls how to respond to errors during migration. /// /// If any inode turns out not to be migrateable (either the source cannot serialize it, or the /// destination cannot opened the serialized representation), the destination can react in /// different ways: /// /// - abort: Whenever any error occurs, return a hard error to the vhost-user front-end (e.g. /// QEMU), aborting migration. /// /// - guest-error: Let migration finish, but the guest will be unable to access any of the /// affected inodes, receiving only errors. /// /// This parameter is ignored on the source side. 
#[arg(long = "migration-on-error", default_value = "abort")] migration_on_error: MigrationOnError, /// Only for find-paths migration mode: Ensure that the migration destination opens the very /// same inodes as the source (only works if source and destination use the same shared /// directory on the same filesystem). /// /// This option makes the source attach the respective file handle to each inode transferred /// during migration. Once the destination has (re-)opened the inode, it will generate the /// file handle on its end, and compare, ensuring that it has opened the very same inode. /// /// (File handles are per-filesystem unique identifiers for inodes that, besides the inode ID, /// also include a generation ID to protect against inode ID reuse.) /// /// Using this option protects against external parties renaming or replacing inodes /// while migration is ongoing, which, without this option, can lead to data loss or /// corruption, so it should always be used when other processes besides virtiofsd have write /// access to the shared directory. However, again, it only works if both source and /// destination use the same shared directory. /// /// This parameter is ignored on the destination side. #[arg(long = "migration-verify-handles")] migration_verify_handles: bool, /// Only for find-paths migration mode: Double-check the identity of inodes right before /// switching over to the destination, potentially making migration more resilient when third /// parties have write access to the shared directory. /// /// When representing migrated inodes using their paths relative to the shared directory, /// double-check during switch-over to the destination that each path still matches the /// respective inode, and on mismatch, try to correct it via the respective symlink in /// /proc/self/fd. 
/// /// Because this option requires accessing each inode indexed or opened by the guest, it can /// prolong the switch-over phase of migration (when both source and destination are paused) /// for an indeterminate amount of time. /// /// This parameter is ignored on the destination side. #[arg(long = "migration-confirm-paths")] migration_confirm_paths: bool, } fn parse_compat(opt: Opt) -> Opt { use clap::error::ErrorKind; fn value_error(arg: &str, value: &str) -> ! { ::command() .error( ErrorKind::InvalidValue, format!("Invalid compat value '{value}' for '-o {arg}'"), ) .exit() } fn argument_error(arg: &str) -> ! { ::command() .error( ErrorKind::UnknownArgument, format!("Invalid compat argument '-o {arg}'"), ) .exit() } fn parse_tuple(opt: &mut Opt, tuple: &str) { match tuple.split('=').collect::>()[..] { ["xattrmap", value] => { opt.xattrmap = Some( XattrMap::try_from(value).unwrap_or_else(|_| value_error("xattrmap", value)), ) } ["cache", value] => match value { "auto" => opt.cache = CachePolicy::Auto, "always" => opt.cache = CachePolicy::Always, "none" => opt.cache = CachePolicy::Never, "metadata" => opt.cache = CachePolicy::Metadata, _ => value_error("cache", value), }, ["loglevel", value] => match value { "debug" => opt.log_level = LevelFilter::Debug, "info" => opt.log_level = LevelFilter::Info, "warn" => opt.log_level = LevelFilter::Warn, "err" => opt.log_level = LevelFilter::Error, _ => value_error("loglevel", value), }, ["sandbox", value] => match value { "namespace" => opt.sandbox = SandboxMode::Namespace, "chroot" => opt.sandbox = SandboxMode::Chroot, _ => value_error("sandbox", value), }, ["source", value] => opt.shared_dir = Some(value.to_string()), ["modcaps", value] => opt.modcaps = Some(value.to_string()), _ => argument_error(tuple), } } fn parse_single(opt: &mut Opt, option: &str) { match option { "xattr" => opt.xattr = true, "no_xattr" => opt.xattr = false, "readdirplus" => opt.no_readdirplus = false, "no_readdirplus" => opt.no_readdirplus = true, 
"writeback" => opt.writeback = true, "no_writeback" => opt.writeback = false, "allow_direct_io" => opt.allow_direct_io = true, "no_allow_direct_io" => opt.allow_direct_io = false, "announce_submounts" => opt.announce_submounts = true, "killpriv_v2" => opt.killpriv_v2 = true, "no_killpriv_v2" => opt.killpriv_v2 = false, "posix_acl" => opt.posix_acl = true, "no_posix_acl" => opt.posix_acl = false, "security_label" => opt.security_label = true, "no_security_label" => opt.security_label = false, "no_posix_lock" | "no_flock" => (), _ => argument_error(option), } } let mut clean_opt = opt.clone(); if let Some(compat_options) = opt.compat_options.as_ref() { for line in compat_options { for option in line.split(',') { if option.contains('=') { parse_tuple(&mut clean_opt, option); } else { parse_single(&mut clean_opt, option); } } } } clean_opt } fn print_capabilities() { println!("{{"); println!(" \"type\": \"fs\","); println!(" \"features\": ["); println!(" \"migrate-precopy\","); println!(" \"separate-options\""); println!(" ]"); println!("}}"); } fn set_default_logger(log_level: LevelFilter) { if env::var("RUST_LOG").is_err() { env::set_var("RUST_LOG", log_level.to_string()); } env_logger::init(); } fn initialize_logging(opt: &Opt) { let log_level = if opt.compat_debug { LevelFilter::Debug } else { opt.log_level }; if opt.syslog { if let Err(e) = syslog::init(syslog::Facility::LOG_USER, log_level, None) { set_default_logger(log_level); warn!("can't enable syslog: {}", e); } } else { set_default_logger(log_level); } } fn set_signal_handlers() { use vmm_sys_util::signal; extern "C" fn handle_signal(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) { unsafe { libc::_exit(1) }; } let signals = vec![libc::SIGHUP, libc::SIGTERM]; for s in signals { if let Err(e) = signal::register_signal_handler(s, handle_signal) { error!("Setting signal handlers: {}", e); process::exit(1); } } } fn parse_modcaps( default_caps: Vec<&str>, modcaps: Option, ) -> (HashSet, HashSet) 
{ let mut required_caps: HashSet = default_caps.iter().map(|&s| s.into()).collect(); let mut disabled_caps = HashSet::new(); if let Some(modcaps) = modcaps { for modcap in modcaps.split(':').map(str::to_string) { if modcap.is_empty() { error!("empty modcap found: expected (+|-)capability:..."); process::exit(1); } let (action, cap_name) = modcap.split_at(1); let cap_name = cap_name.to_uppercase(); if !matches!(action, "+" | "-") { error!( "invalid modcap action: expecting '+'|'-' but found '{}'", action ); process::exit(1); } if let Err(error) = capng::name_to_capability(&cap_name) { error!("invalid capability '{}': {}", &cap_name, error); process::exit(1); } match action { "+" => { disabled_caps.remove(&cap_name); required_caps.insert(cap_name); } "-" => { required_caps.remove(&cap_name); disabled_caps.insert(cap_name); } _ => unreachable!(), } } } (required_caps, disabled_caps) } fn drop_capabilities(inode_file_handles: InodeFileHandlesMode, modcaps: Option) { let default_caps = vec![ "CHOWN", "DAC_OVERRIDE", "FOWNER", "FSETID", "SETGID", "SETUID", "MKNOD", "SETFCAP", ]; let (mut required_caps, disabled_caps) = parse_modcaps(default_caps, modcaps); if inode_file_handles != InodeFileHandlesMode::Never { let required_cap = "DAC_READ_SEARCH".to_owned(); if disabled_caps.contains(&required_cap) { error!( "can't disable {} when using --inode-file-handles={:?}", &required_cap, inode_file_handles ); process::exit(1); } required_caps.insert(required_cap); } capng::clear(capng::Set::BOTH); // Configure the required set of capabilities for the child, and leave the // parent with none. 
if let Err(e) = capng::updatev( capng::Action::ADD, capng::Type::PERMITTED | capng::Type::EFFECTIVE, required_caps.iter().map(String::as_str).collect(), ) { error!("can't set up the child capabilities: {}", e); process::exit(1); } if let Err(e) = capng::apply(capng::Set::BOTH) { error!("can't apply the child capabilities: {}", e); process::exit(1); } } fn main() { let opt = parse_compat(Opt::parse()); // Enable killpriv_v2 only if user explicitly asked for it by using // --killpriv-v2 or -o killpriv_v2. Otherwise disable it by default. let killpriv_v2 = opt.killpriv_v2; // Disable announce submounts if the user asked for it let announce_submounts = !opt.no_announce_submounts; if opt.print_capabilities { print_capabilities(); return; } initialize_logging(&opt); set_signal_handlers(); let shared_dir = match opt.shared_dir.as_ref() { Some(s) => s, None => { error!("missing \"--shared-dir\" or \"-o source\" option"); process::exit(1); } }; let shadir_path = Path::new(shared_dir); if !shadir_path.is_dir() && !shadir_path.is_file() { error!("{shared_dir} does not exist"); process::exit(1); } if opt.compat_foreground { warn!("Use of deprecated flag '-f': This flag has no effect, please remove it"); } if opt.compat_debug { warn!("Use of deprecated flag '-d': Please use the '--log-level debug' option instead"); } if opt.compat_options.is_some() { warn!("Use of deprecated option format '-o': Please specify options without it (e.g., '--cache auto' instead of '-o cache=auto')"); } if opt.inode_file_handles == InodeFileHandlesCommandLineMode::Fallback { warn!("Use of deprecated value 'fallback' for '--inode-file-handles': Please use 'prefer' instead"); } // Check migration argument compatibility match opt.migration_mode { MigrationMode::FindPaths => (), // all allowed MigrationMode::FileHandles => { if opt.migration_confirm_paths || opt.migration_verify_handles { if opt.migration_confirm_paths { error!("Cannot use --migration-confirm-paths with --migration-mode=file-handles 
(because it is unnecessary)"); } if opt.migration_verify_handles { error!("Cannot use --migration-verify-handles with --migration-mode=file-handles (because it is unnecessary)"); } process::exit(1); } } } let xattrmap = opt.xattrmap.clone(); let xattr = xattrmap.is_some() || opt.posix_acl || opt.xattr; let thread_pool_size = opt.thread_pool_size; let readdirplus = match opt.cache { CachePolicy::Never => false, _ => !opt.no_readdirplus, }; let timeout = match opt.cache { CachePolicy::Never => Duration::from_secs(0), CachePolicy::Metadata => Duration::from_secs(86400), CachePolicy::Auto => Duration::from_secs(1), CachePolicy::Always => Duration::from_secs(86400), }; let umask = if opt.socket_group.is_some() { libc::S_IROTH | libc::S_IWOTH | libc::S_IXOTH } else { libc::S_IRGRP | libc::S_IWGRP | libc::S_IXGRP | libc::S_IROTH | libc::S_IWOTH | libc::S_IXOTH }; // We need to keep _pid_file around because it maintains a lock on the pid file // that prevents another daemon from using the same pid file. 
let (listener, socket_path, _pid_file) = match opt.fd.as_ref() { Some(fd) => unsafe { (Listener::from_raw_fd(*fd), None, None) }, None => { // Set umask to ensure the socket is created with the right permissions let _umask_guard = oslib::ScopedUmask::new(umask); let socket = opt.socket_path.as_ref().unwrap_or_else(|| { warn!("use of deprecated parameter '--socket': Please use the '--socket-path' option instead"); opt.socket.as_ref().unwrap() // safe to unwrap because clap ensures either --socket or --socket-path are passed }); let socket_parent_dir = Path::new(socket).parent().unwrap_or_else(|| { error!("Invalid socket file name"); process::exit(1); }); if !socket_parent_dir.as_os_str().is_empty() && !socket_parent_dir.exists() { error!( "{} does not exist or is not a directory", socket_parent_dir.to_string_lossy() ); process::exit(1); } let pid_file_name = socket.to_owned() + ".pid"; let pid_file_path = Path::new(pid_file_name.as_str()); let pid_file = write_pid_file(pid_file_path).unwrap_or_else(|error| { error!("Error creating pid file '{}': {}", pid_file_name, error); process::exit(1); }); let listener = Listener::new(socket, true).unwrap_or_else(|error| { error!("Error creating listener: {}", error); process::exit(1); }); (listener, Some(socket.clone()), Some(pid_file)) } }; if let Some(group_name) = opt.socket_group { let c_name = CString::new(group_name).expect("invalid group name"); let group = unsafe { libc::getgrnam(c_name.as_ptr()) }; if group.is_null() { error!("Couldn't resolve the group name specified for the socket path"); process::exit(1); } // safe to unwrap because clap ensures --socket-group can't be specified alongside --fd let c_socket_path = CString::new(socket_path.unwrap()).expect("invalid socket path"); let ret = unsafe { libc::chown(c_socket_path.as_ptr(), u32::MAX, (*group).gr_gid) }; if ret != 0 { error!( "Couldn't set up the group for the socket path: {}", std::io::Error::last_os_error() ); process::exit(1); } } 
limits::setup_rlimit_nofile(opt.rlimit_nofile).unwrap_or_else(|error| { error!("Error increasing number of open files: {}", error); process::exit(1) }); let mut sandbox = Sandbox::new( shared_dir.to_string(), opt.sandbox, opt.uid_map, opt.gid_map, ) .unwrap_or_else(|error| { error!("Error creating sandbox: {}", error); process::exit(1) }); // Enter the sandbox, from this point the process will be isolated (or not) // as chosen in '--sandbox'. let listener = sandbox.enter(listener).unwrap_or_else(|error| { error!("Error entering sandbox: {}", error); process::exit(1) }); let fs_cfg = passthrough::Config { entry_timeout: timeout, attr_timeout: timeout, cache_policy: opt.cache, root_dir: sandbox.get_root_dir(), mountinfo_prefix: sandbox.get_mountinfo_prefix(), xattr, xattrmap, proc_sfd_rawfd: sandbox.get_proc_self_fd(), proc_mountinfo_rawfd: sandbox.get_mountinfo_fd(), announce_submounts, inode_file_handles: opt.inode_file_handles.into(), readdirplus, writeback: opt.writeback, allow_direct_io: opt.allow_direct_io, killpriv_v2, security_label: opt.security_label, posix_acl: opt.posix_acl, clean_noatime: !opt.preserve_noatime, allow_mmap: opt.allow_mmap, migration_on_error: opt.migration_on_error, migration_verify_handles: opt.migration_verify_handles, migration_confirm_paths: opt.migration_confirm_paths, migration_mode: opt.migration_mode, uid_map: Some(opt.translate_uid), gid_map: Some(opt.translate_gid), ..Default::default() }; // Must happen before we start the thread pool match opt.seccomp { SeccompAction::Allow => {} _ => enable_seccomp(opt.seccomp, opt.syslog).unwrap(), } // We don't modify the capabilities if the user call us without // any sandbox (i.e. 
--sandbox=none) as unprivileged user let uid = unsafe { libc::geteuid() }; if uid == 0 { drop_capabilities(fs_cfg.inode_file_handles, opt.modcaps); } if opt.readonly { let fs = PassthroughFsRo::new(fs_cfg).unwrap_or_else(|e| { error!("Failed to create internal filesystem representation: {e}"); process::exit(1); }); run_generic_fs(fs, listener, thread_pool_size, opt.tag); } else { let fs = PassthroughFs::new(fs_cfg).unwrap_or_else(|e| { error!("Failed to create internal filesystem representation: {e}"); process::exit(1); }); run_generic_fs(fs, listener, thread_pool_size, opt.tag); } } // Use a generic function for the main loop so we don't need to use Box fn run_generic_fs( fs: F, listener: Listener, thread_pool_size: usize, tag: Option, ) { let fs_backend = Arc::new( VhostUserFsBackendBuilder::default() .set_thread_pool_size(thread_pool_size) .set_tag(tag) .build(fs) .unwrap_or_else(|error| { error!("Error creating vhost-user backend: {}", error); process::exit(1) }), ); let mut daemon = VhostUserDaemon::new( String::from("virtiofsd-backend"), fs_backend, GuestMemoryAtomic::new(GuestMemoryMmap::new()), ) .unwrap(); info!("Waiting for vhost-user socket connection..."); if let Err(e) = daemon.start(listener) { error!("Failed to start daemon: {:?}", e); process::exit(1); } info!("Client connected, servicing requests"); if let Err(e) = daemon.wait() { match e { HandleRequest(Disconnected) => info!("Client disconnected, shutting down"), _ => error!("Waiting for daemon failed: {:?}", e), } } } virtiofsd-1.13.0/src/oslib.rs000064400000000000000000000433341046102023000141730ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause use crate::soft_idmap::{HostGid, HostUid, Id}; use bitflags::bitflags; use std::ffi::{CStr, CString}; use std::fs::File; use std::io::{self, Error, Result}; use std::os::unix::io::{AsRawFd, BorrowedFd, RawFd}; use std::os::unix::prelude::FromRawFd; // A helper function that check the return value of a C function call // and wraps it in a 
`Result` type, returning the `errno` code as `Err`. fn check_retval + PartialEq>(t: T) -> Result { if t == T::from(-1_i8) { Err(Error::last_os_error()) } else { Ok(t) } } /// Simple object to collect basic facts about the OS, /// such as available syscalls. pub struct OsFacts { pub has_openat2: bool, } #[allow(clippy::new_without_default)] impl OsFacts { /// This object should only be constructed using new. #[must_use] pub fn new() -> Self { // Checking for `openat2()` since it first appeared in Linux 5.6. // SAFETY: all-zero byte-pattern is a valid `libc::open_how` let how: libc::open_how = unsafe { std::mem::zeroed() }; let cwd = CString::new(".").unwrap(); // SAFETY: `cwd.as_ptr()` points to a valid NUL-terminated string, // and the `how` pointer is a valid pointer to an `open_how` struct. let fd = unsafe { libc::syscall( libc::SYS_openat2, libc::AT_FDCWD, cwd.as_ptr(), std::ptr::addr_of!(how), std::mem::size_of::(), ) }; let has_openat2 = fd >= 0; if has_openat2 { // SAFETY: `fd` is an open file descriptor unsafe { libc::close(fd as libc::c_int); } } Self { has_openat2 } } } /// Safe wrapper for `mount(2)` /// /// # Errors /// /// Will return `Err(errno)` if `mount(2)` fails. /// Each filesystem type may have its own special errors and its own special behavior, /// see `mount(2)` and the linux source kernel for details. /// /// # Panics /// /// This function panics if the strings `source`, `target` or `fstype` contain an internal 0 byte. 
pub fn mount(source: Option<&str>, target: &str, fstype: Option<&str>, flags: u64) -> Result<()> { let source = CString::new(source.unwrap_or("")).unwrap(); let source = source.as_ptr(); let target = CString::new(target).unwrap(); let target = target.as_ptr(); let fstype = CString::new(fstype.unwrap_or("")).unwrap(); let fstype = fstype.as_ptr(); // Safety: `source`, `target` or `fstype` are a valid C string pointers check_retval(unsafe { libc::mount(source, target, fstype, flags, std::ptr::null()) })?; Ok(()) } /// Safe wrapper for `umount2(2)` /// /// # Errors /// /// Will return `Err(errno)` if `umount2(2)` fails. /// Each filesystem type may have its own special errors and its own special behavior, /// see `umount2(2)` and the linux source kernel for details. /// /// # Panics /// /// This function panics if the strings `target` contains an internal 0 byte. pub fn umount2(target: &str, flags: i32) -> Result<()> { let target = CString::new(target).unwrap(); let target = target.as_ptr(); // Safety: `target` is a valid C string pointer check_retval(unsafe { libc::umount2(target, flags) })?; Ok(()) } /// Safe wrapper for `fchdir(2)` /// /// # Errors /// /// Will return `Err(errno)` if `fchdir(2)` fails. /// Each filesystem type may have its own special errors, see `fchdir(2)` for details. pub fn fchdir(fd: RawFd) -> Result<()> { check_retval(unsafe { libc::fchdir(fd) })?; Ok(()) } /// Safe wrapper for `fchmod(2)` /// /// # Errors /// /// Will return `Err(errno)` if `fchmod(2)` fails. /// Each filesystem type may have its own special errors, see `fchmod(2)` for details. pub fn fchmod(fd: RawFd, mode: libc::mode_t) -> Result<()> { check_retval(unsafe { libc::fchmod(fd, mode) })?; Ok(()) } /// Safe wrapper for `fchmodat(2)` /// /// # Errors /// /// Will return `Err(errno)` if `fchmodat(2)` fails. /// Each filesystem type may have its own special errors, see `fchmodat(2)` for details. 
pub fn fchmodat(dirfd: RawFd, pathname: String, mode: libc::mode_t, flags: i32) -> Result<()> {
    // An interior NUL byte would make the path unrepresentable as a C string;
    // report it as invalid data rather than panicking.
    let pathname =
        CString::new(pathname).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
    let pathname = pathname.as_ptr();
    // SAFETY: `pathname` is a valid C string pointer; the caller must ensure
    // `dirfd` is a valid directory file descriptor.
    check_retval(unsafe { libc::fchmodat(dirfd, pathname, mode, flags) })?;
    Ok(())
}

/// Safe wrapper for `umask(2)`
///
/// Returns the previous value of the mask.
pub fn umask(mask: u32) -> u32 {
    // SAFETY: this call doesn't modify any memory and there is no need
    // to check the return value because this system call always succeeds.
    unsafe { libc::umask(mask) }
}

/// An RAII implementation of a scoped file mode creation mask (umask), it set the
/// new umask. When this structure is dropped (falls out of scope), it set the previous
/// value of the mask.
pub struct ScopedUmask {
    // The mask that was in effect before this guard was created; restored on drop.
    umask: libc::mode_t,
}

impl ScopedUmask {
    /// Install `new_umask` and remember the previously active mask.
    pub fn new(new_umask: u32) -> Self {
        Self {
            umask: umask(new_umask),
        }
    }
}

impl Drop for ScopedUmask {
    fn drop(&mut self) {
        umask(self.umask);
    }
}

/// Safe wrapper around `openat(2)`.
///
/// # Errors
///
/// Will return `Err(errno)` if `openat(2)` fails,
/// see `openat(2)` for details.
pub fn openat(
    dir: &impl AsRawFd,
    pathname: &CStr,
    flags: i32,
    mode: Option<u32>,
) -> Result<RawFd> {
    // `mode` travels through `openat(2)`'s varargs; the kernel ignores it
    // unless a file-creating flag is set.
    let mode = u64::from(mode.unwrap_or(0));

    // SAFETY: `pathname` points to a valid NUL-terminated string.
    // However, the caller must ensure that `dir` can provide a valid file descriptor.
    check_retval(unsafe {
        libc::openat(
            dir.as_raw_fd(),
            pathname.as_ptr(),
            flags as libc::c_int,
            mode,
        )
    })
}

/// An utility function that uses `openat2(2)` to restrict the how the provided pathname
/// is resolved. It uses the following flags:
/// - `RESOLVE_IN_ROOT`: Treat the directory referred to by dirfd as the root directory while
///   resolving pathname. This has the effect as though virtiofsd had used chroot(2) to modify its
///   root directory to dirfd.
/// - `RESOLVE_NO_MAGICLINKS`: Disallow all magic-link (i.e., proc(2) link-like files) resolution
///   during path resolution.
/// /// Additionally, the flags `O_NOFOLLOW` and `O_CLOEXEC` are added. /// /// # Error /// /// Will return `Err(errno)` if `openat2(2)` fails, see the man page for details. /// /// # Safety /// /// The caller must ensure that dirfd is a valid file descriptor. pub fn do_open_relative_to( dir: &impl AsRawFd, pathname: &CStr, flags: i32, mode: Option, ) -> Result { // `openat2(2)` returns an error if `how.mode` contains bits other than those in range 07777, // let's ignore the extra bits to be compatible with `openat(2)`. let mode = u64::from(mode.unwrap_or(0)) & 0o7777; // SAFETY: all-zero byte-pattern represents a valid `libc::open_how` let mut how: libc::open_how = unsafe { std::mem::zeroed() }; how.resolve = libc::RESOLVE_IN_ROOT | libc::RESOLVE_NO_MAGICLINKS; how.flags = flags as u64; how.mode = mode; // SAFETY: `pathname` points to a valid NUL-terminated string, and the `how` pointer is a valid // pointer to an `open_how` struct. However, the caller must ensure that `dir` can provide a // valid file descriptor (this can be changed to BorrowedFd). 
check_retval(unsafe { libc::syscall( libc::SYS_openat2, dir.as_raw_fd(), pathname.as_ptr(), std::ptr::addr_of!(how), std::mem::size_of::(), ) } as RawFd) } mod filehandle { use crate::passthrough::file_handle::SerializableFileHandle; use crate::util::other_io_error; use std::convert::{TryFrom, TryInto}; use std::io; const MAX_HANDLE_SZ: usize = 128; #[derive(Clone, PartialOrd, Ord, PartialEq, Eq)] #[repr(C)] pub struct CFileHandle { handle_bytes: libc::c_uint, handle_type: libc::c_int, f_handle: [u8; MAX_HANDLE_SZ], } impl Default for CFileHandle { fn default() -> Self { CFileHandle { handle_bytes: MAX_HANDLE_SZ as libc::c_uint, handle_type: 0, f_handle: [0; MAX_HANDLE_SZ], } } } impl CFileHandle { pub fn as_bytes(&self) -> &[u8] { &self.f_handle[..(self.handle_bytes as usize)] } pub fn handle_type(&self) -> libc::c_int { self.handle_type } } impl TryFrom<&SerializableFileHandle> for CFileHandle { type Error = io::Error; fn try_from(sfh: &SerializableFileHandle) -> io::Result { let sfh_bytes = sfh.as_bytes(); if sfh_bytes.len() > MAX_HANDLE_SZ { return Err(other_io_error("File handle too long")); } let mut f_handle = [0u8; MAX_HANDLE_SZ]; f_handle[..sfh_bytes.len()].copy_from_slice(sfh_bytes); Ok(CFileHandle { handle_bytes: sfh_bytes.len().try_into().map_err(|err| { other_io_error(format!( "Handle size ({} bytes) too big: {}", sfh_bytes.len(), err )) })?, #[allow(clippy::useless_conversion)] handle_type: sfh.handle_type().try_into().map_err(|err| { other_io_error(format!( "Handle type (0x{:x}) too large: {}", sfh.handle_type(), err )) })?, f_handle, }) } } extern "C" { pub fn name_to_handle_at( dirfd: libc::c_int, pathname: *const libc::c_char, file_handle: *mut CFileHandle, mount_id: *mut libc::c_int, flags: libc::c_int, ) -> libc::c_int; // Technically `file_handle` should be a `mut` pointer, but `open_by_handle_at()` is specified // not to change it, so we can declare it `const`. 
pub fn open_by_handle_at( mount_fd: libc::c_int, file_handle: *const CFileHandle, flags: libc::c_int, ) -> libc::c_int; } } pub use filehandle::CFileHandle; pub fn name_to_handle_at( dirfd: &impl AsRawFd, pathname: &CStr, file_handle: &mut CFileHandle, mount_id: &mut libc::c_int, flags: libc::c_int, ) -> Result<()> { // SAFETY: `dirfd` is a valid file descriptor, `file_handle` // is a valid reference to `CFileHandle`, and `mount_id` is // valid reference to an `int` check_retval(unsafe { filehandle::name_to_handle_at( dirfd.as_raw_fd(), pathname.as_ptr(), file_handle, mount_id, flags, ) })?; Ok(()) } pub fn open_by_handle_at( mount_fd: &impl AsRawFd, file_handle: &CFileHandle, flags: libc::c_int, ) -> Result { // SAFETY: `mount_fd` is a valid file descriptor and `file_handle` // is a valid reference to `CFileHandle` let fd = check_retval(unsafe { filehandle::open_by_handle_at(mount_fd.as_raw_fd(), file_handle, flags) })?; // SAFETY: `open_by_handle_at()` guarantees `fd` is a valid file descriptor Ok(unsafe { File::from_raw_fd(fd) }) } mod writev { //! musl does not provide a wrapper for the `pwritev2(2)` system call, //! we need to call it using `syscall(2)`. #[cfg(target_env = "gnu")] pub use libc::pwritev2; #[cfg(target_env = "musl")] pub unsafe fn pwritev2( fd: libc::c_int, iov: *const libc::iovec, iovcnt: libc::c_int, offset: libc::off_t, flags: libc::c_int, ) -> libc::ssize_t { // The `pwritev2(2)` syscall expects to receive the 64-bit offset split in // its high and low parts (see `syscall(2)`). On 64-bit architectures we // set `lo_off=offset` and `hi_off=0` (glibc does it), since `hi_off` is cleared, // so we need to make sure of not clear the higher 32 bits of `lo_off`, otherwise // the offset will be 0 on 64-bit architectures. 
let lo_off = offset as libc::c_long; // warn: do not clear the higher 32 bits let hi_off = (offset as u64).checked_shr(libc::c_long::BITS).unwrap_or(0) as libc::c_long; unsafe { libc::syscall(libc::SYS_pwritev2, fd, iov, iovcnt, lo_off, hi_off, flags) as libc::ssize_t } } } // We cannot use libc::RWF_HIPRI, etc, because these constants are not defined in musl. bitflags! { /// A bitwise OR of zero or more flags passed in as a parameter to the /// write vectored function `writev_at()`. pub struct WritevFlags: i32 { /// High priority write. Allows block-based filesystems to use polling of the device, which /// provides lower latency, but may use additional resources. (Currently, this feature is /// usable only on a file descriptor opened using the O_DIRECT flag.) const RWF_HIPRI = 0x00000001; /// Provide a per-write equivalent of the O_DSYNC open(2) flag. Its effect applies /// only to the data range written by the system call. const RWF_DSYNC = 0x00000002; /// Provide a per-write equivalent of the O_SYNC open(2) flag. Its effect applies only /// to the data range written by the system call. const RWF_SYNC = 0x00000004; /// Provide a per-write equivalent of the O_APPEND open(2) flag. Its effect applies only /// to the data range written by the system call. The offset argument does not affect the /// write operation; the data is always appended to the end of the file. /// However, if the offset argument is -1, the current file offset is updated. 
const RWF_APPEND = 0x00000010; } } #[cfg(target_env = "gnu")] mod writev_test { // Lets make sure (at compile time) that the WritevFlags don't go out of sync with the libc const _: () = assert!( super::WritevFlags::RWF_HIPRI.bits() == libc::RWF_HIPRI, "invalid RWF_HIPRI value" ); const _: () = assert!( super::WritevFlags::RWF_DSYNC.bits() == libc::RWF_DSYNC, "invalid RWF_DSYNC value" ); const _: () = assert!( super::WritevFlags::RWF_SYNC.bits() == libc::RWF_SYNC, "invalid RWF_SYNC value" ); const _: () = assert!( super::WritevFlags::RWF_APPEND.bits() == libc::RWF_APPEND, "invalid RWF_APPEND value" ); } /// Safe wrapper for `pwritev2(2)` /// /// This system call is similar `pwritev(2)`, but add a new argument, /// flags, which modifies the behavior on a per-call basis. /// Unlike `pwritev(2)`, if the offset argument is -1, then the current file offset /// is used and updated. /// /// # Errors /// /// Will return `Err(errno)` if `pwritev2(2)` fails, see `pwritev2(2)` for details. /// /// # Safety /// /// The caller must ensure that each iovec element is valid (i.e., it has a valid `iov_base` /// pointer and `iov_len`). pub unsafe fn writev_at( fd: BorrowedFd, iovecs: &[libc::iovec], offset: i64, flags: Option, ) -> Result { let flags = flags.unwrap_or(WritevFlags::empty()); // SAFETY: `fd` is a valid filed descriptor, `iov` is a valid pointer // to the iovec slice `ìovecs` of `iovcnt` elements. However, the caller // must ensure that each iovec element has a valid `iov_base` pointer and `iov_len`. 
let bytes_written = check_retval(unsafe { writev::pwritev2( fd.as_raw_fd(), iovecs.as_ptr(), iovecs.len() as libc::c_int, offset, flags.bits(), ) })?; Ok(bytes_written as usize) } pub struct PipeReader(File); impl io::Read for PipeReader { fn read(&mut self, buf: &mut [u8]) -> io::Result { self.0.read(buf) } } pub struct PipeWriter(File); impl io::Write for PipeWriter { fn write(&mut self, buf: &[u8]) -> io::Result { self.0.write(buf) } fn flush(&mut self) -> io::Result<()> { self.0.flush() } } pub fn pipe() -> io::Result<(PipeReader, PipeWriter)> { let mut fds: [RawFd; 2] = [-1, -1]; let ret = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) }; if ret == -1 { Err(io::Error::last_os_error()) } else { Ok(( PipeReader(unsafe { File::from_raw_fd(fds[0]) }), PipeWriter(unsafe { File::from_raw_fd(fds[1]) }), )) } } // We want credential changes to be per-thread because otherwise // we might interfere with operations being carried out on other // threads with different uids/gids. However, posix requires that // all threads in a process share the same credentials. To do this // libc uses signals to ensure that when one thread changes its // credentials the other threads do the same thing. // // So instead we invoke the syscall directly in order to get around // this limitation. Another option is to use the setfsuid and // setfsgid systems calls. However since those calls have no way to // return an error, it's preferable to do this instead. 
/// Set effective user ID pub fn seteffuid(uid: HostUid) -> io::Result<()> { check_retval(unsafe { libc::syscall(libc::SYS_setresuid, -1, uid.into_inner(), -1) })?; Ok(()) } /// Set effective group ID pub fn seteffgid(gid: HostGid) -> io::Result<()> { check_retval(unsafe { libc::syscall(libc::SYS_setresgid, -1, gid.into_inner(), -1) })?; Ok(()) } /// Set supplementary group pub fn setsupgroup(gid: HostGid) -> io::Result<()> { let gid_raw = gid.into_inner(); check_retval(unsafe { libc::setgroups(1, &gid_raw) })?; Ok(()) } /// Drop all supplementary groups pub fn dropsupgroups() -> io::Result<()> { check_retval(unsafe { libc::setgroups(0, std::ptr::null()) })?; Ok(()) } virtiofsd-1.13.0/src/passthrough/credentials.rs000064400000000000000000000132661046102023000177300ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause use crate::oslib; use crate::passthrough::util::einval; use crate::soft_idmap::{HostGid, HostUid, Id}; use std::io; pub struct UnixCredentials { uid: HostUid, gid: HostGid, sup_gid: Option, keep_capability: bool, } impl UnixCredentials { pub fn new(uid: HostUid, gid: HostGid) -> Self { UnixCredentials { uid, gid, sup_gid: None, keep_capability: false, } } /// Set a supplementary group. Set `supported_extension` to `false` to signal that a /// supplementary group maybe required, but the guest was not able to tell us which, /// so we have to rely on keeping the DAC_OVERRIDE capability. pub fn supplementary_gid(self, supported_extension: bool, sup_gid: Option) -> Self { UnixCredentials { uid: self.uid, gid: self.gid, sup_gid, keep_capability: !supported_extension, } } /// Changes the effective uid/gid of the current thread to `val`. Changes /// the thread's credentials back to root when the returned struct is dropped. 
pub fn set(self) -> io::Result> { // Safe: Always succesful let current_uid = HostUid::from(unsafe { libc::geteuid() }); let current_gid = HostGid::from(unsafe { libc::getegid() }); // Not to change UID/GID when they’re 0 (root) is legacy behavior that we’re afraid to // change let change_uid = !self.uid.is_root() && self.uid != current_uid; let change_gid = !self.gid.is_root() && self.gid != current_gid; // We have to change the gid before we change the uid because if we // change the uid first then we lose the capability to change the gid. // However changing back can happen in any order. if let Some(sup_gid) = self.sup_gid { oslib::setsupgroup(sup_gid)?; } if change_gid { oslib::seteffgid(self.gid)?; } if change_uid { oslib::seteffuid(self.uid)?; } if change_uid && self.keep_capability { // Before kernel 6.3, we don't have access to process supplementary groups. // To work around this we can set the `DAC_OVERRIDE` in the effective set. // We are allowed to set the capability because we only change the effective // user ID, so we still have the 'DAC_OVERRIDE' in the permitted set. // After switching back to root the permitted set is copied to the effective set, // so no additional steps are required. 
if let Err(e) = crate::util::add_cap_to_eff("DAC_OVERRIDE") { warn!("failed to add 'DAC_OVERRIDE' to the effective set of capabilities: {e}"); } } if !change_uid && !change_gid { return Ok(None); } Ok(Some(UnixCredentialsGuard { reset_uid: change_uid.then_some(current_uid), reset_gid: change_gid.then_some(current_gid), drop_sup_gid: self.sup_gid.is_some(), })) } } pub struct UnixCredentialsGuard { reset_uid: Option, reset_gid: Option, drop_sup_gid: bool, } impl Drop for UnixCredentialsGuard { fn drop(&mut self) { if let Some(uid) = self.reset_uid { oslib::seteffuid(uid).unwrap_or_else(|e| { error!("failed to change uid back to {uid}: {e}"); }); } if let Some(gid) = self.reset_gid { oslib::seteffgid(gid).unwrap_or_else(|e| { error!("failed to change gid back to {gid}: {e}"); }); } if self.drop_sup_gid { oslib::dropsupgroups().unwrap_or_else(|e| { error!("failed to drop supplementary groups: {e}"); }); } } } pub struct ScopedCaps { cap: capng::Capability, } impl ScopedCaps { fn new(cap_name: &str) -> io::Result> { use capng::{Action, CUpdate, Set, Type}; let cap = capng::name_to_capability(cap_name).map_err(|_| { let err = io::Error::last_os_error(); error!( "couldn't get the capability id for name {}: {:?}", cap_name, err ); err })?; if capng::have_capability(Type::EFFECTIVE, cap) { let req = vec![CUpdate { action: Action::DROP, cap_type: Type::EFFECTIVE, capability: cap, }]; capng::update(req).map_err(|e| { error!("couldn't drop {} capability: {:?}", cap, e); einval() })?; capng::apply(Set::CAPS).map_err(|e| { error!( "couldn't apply capabilities after dropping {}: {:?}", cap, e ); einval() })?; Ok(Some(Self { cap })) } else { Ok(None) } } } impl Drop for ScopedCaps { fn drop(&mut self) { use capng::{Action, CUpdate, Set, Type}; let req = vec![CUpdate { action: Action::ADD, cap_type: Type::EFFECTIVE, capability: self.cap, }]; if let Err(e) = capng::update(req) { panic!("couldn't restore {} capability: {:?}", self.cap, e); } if let Err(e) = capng::apply(Set::CAPS) { 
panic!( "couldn't apply capabilities after restoring {}: {:?}", self.cap, e ); } } } pub fn drop_effective_cap(cap_name: &str) -> io::Result> { ScopedCaps::new(cap_name) } virtiofsd-1.13.0/src/passthrough/device_state/deserialization.rs000064400000000000000000000521721046102023000232570ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. /*! * Deserialization functionality (i.e. what happens in * `SerializableFileSystem::deserialize_and_apply()`): Take a plain vector of bytes, deserialize * it into our serializable structs ('serialized' module), and then apply the information from * there to a `PassthroughFs`, restoring the state from the migration source. */ use crate::fuse; use crate::passthrough::device_state::preserialization::HandleMigrationInfo; use crate::passthrough::device_state::serialized; use crate::passthrough::file_handle::SerializableFileHandle; use crate::passthrough::inode_store::{InodeData, InodeIds, StrongInodeReference}; use crate::passthrough::mount_fd::MountFd; use crate::passthrough::stat::statx; use crate::passthrough::util::{openat, printable_fd}; use crate::passthrough::{ FileOrHandle, HandleData, HandleDataFile, InodeFileHandlesMode, MigrationOnError, PassthroughFs, }; use crate::util::{other_io_error, ErrorContext, ResultErrorContext}; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::io; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex, RwLock}; impl TryFrom> for serialized::PassthroughFs { type Error = io::Error; /// Root of deserialization: Turn plain bytes into a structured `serialized::PassthroughFs` fn try_from(serialized: Vec) -> io::Result { postcard::from_bytes(&serialized).map_err(other_io_error) } } impl serialized::PassthroughFsV1 { /** * Apply the state represented in `self: PassthroughFsV1` to `fs`. 
* * Restore the inode store, open handles, etc. */ pub(super) fn apply(self, fs: &PassthroughFs) -> io::Result<()> { self.apply_with_mount_paths(fs, HashMap::new()) } /** * Actual `apply()` implementation. * * Underlying implementation for both `PassthroughFsV1::apply()` and * `PassthroughFsV2::apply()`. Migrating file handles requires a map of source mount IDs to * paths inside the shared directory, which is not present in `PassthroughFsV1`. This function * takes this argument (`mount_paths`) explicitly, allowing `PassthroughFsV1::apply()` to pass * an empty map (not allowing migration of file handles), and `PassthroughFsV2::apply()` to * pass the map it got. */ fn apply_with_mount_paths( mut self, fs: &PassthroughFs, mount_paths: HashMap, ) -> io::Result<()> { // Apply options as negotiated with the guest on the source self.negotiated_opts.apply(fs)?; fs.inodes.clear(); let mount_fds: HashMap> = if self.inodes.is_empty() || mount_paths.is_empty() { // No nodes or mount paths given? We will not need this map, just create an empty one HashMap::new() } else { // Deserialize the root inode first; every path in `mount_paths` is relative to it, so // we must have it open to deserialize the mount FD map let Some((root_index, _)) = self .inodes .iter() .enumerate() .find(|(_, inode)| inode.id == fuse::ROOT_ID) else { return Err(other_io_error("Received no root node from the source")); }; self.inodes .swap_remove(root_index) .deserialize_root_node(fs)?; let root_node = fs.inodes.get(fuse::ROOT_ID).unwrap(); let root_node_file = root_node .get_file() .err_context(|| "Cannot open shared directory")?; mount_paths.into_iter().filter_map(|(mount_id, mount_path)| { match MountFd::new(fs.mount_fds.as_ref(), &root_node_file, &mount_path) { Ok(mount_fd) => Some((mount_id, mount_fd)), Err(err) => { warn!( "Failed to open path {mount_path} to open file handles for mount ID {mount_id}: {err}; \ will not be able to open inodes represented by file handles on that mount" ); None } } 
}).collect() }; // Some inodes may depend on other inodes being deserialized before them, so trying to // deserialize them without their dependency being fulfilled will return `false` below, // asking to be deferred. Therefore, it may take multiple iterations until we have // successfully deserialized all inodes. // (However serialized inodes are represented, it must be ensured that no loops occur in // such dependencies.) while !self.inodes.is_empty() { let mut i = 0; let mut processed_any = false; while i < self.inodes.len() { if self.inodes[i].deserialize_with_fs(fs, &mount_fds)? { // All good self.inodes.swap_remove(i); processed_any = true; } else { // Process this inode later (e.g. needs to resolve a reference to a parent node // that has not yet been deserialized) i += 1; } } if !processed_any { return Err(other_io_error( "Unresolved references between serialized inodes", )); } } fs.next_inode.store(self.next_inode, Ordering::Relaxed); // Reconstruct handles (i.e., open those files) *fs.handles.write().unwrap() = BTreeMap::new(); for handle in self.handles { handle.deserialize_with_fs(fs)?; } fs.next_handle.store(self.next_handle, Ordering::Relaxed); Ok(()) } } impl serialized::PassthroughFsV2 { /** * Apply the state represented in `self: PassthroughFsV2` to `fs`. * * Restore the inode store, open handles, etc. */ pub(super) fn apply(self, fs: &PassthroughFs) -> io::Result<()> { self.v1.apply_with_mount_paths(fs, self.mount_paths) } } impl serialized::NegotiatedOpts { /// Apply the options negotiated with the guest on the source side to `fs`'s configuration fn apply(self, fs: &PassthroughFs) -> io::Result<()> { if !fs.cfg.writeback && self.writeback { return Err(other_io_error( "Migration source wants writeback enabled, but it is disabled on the destination", )); } // Note the case of `fs.cfg.writeback && !self.writeback`, i.e. 
the user asked for it to be // enabled, but the migration source had it disabled: From a technical perspective, just // disabling it here is fine, because that is what happens (and what we want to happen) // when the guest does not support the flag (in which case there will already have been a // warning on INIT). However, it is imaginable that the guest supports the flag, but it // was user-disabled on the source (and is user-enabled now): We can't distinguish this // case from the no-guest-support one, and disabling the flag is still the right thing to // do, because we would need to re-negotiate through INIT first before we can enable it. // Given that it would be strange for the user to use different configurations for source // and destination, do not print a warning either. fs.writeback.store(self.writeback, Ordering::Relaxed); if !fs.cfg.announce_submounts && self.announce_submounts { return Err(other_io_error( "Migration source wants announce-submounts enabled, but it is disabled on the \ destination", )); } // The comment from writeback applies here, too fs.announce_submounts .store(self.announce_submounts, Ordering::Relaxed); if !fs.cfg.posix_acl && self.posix_acl { return Err(other_io_error( "Migration source wants posix ACLs enabled, but it is disabled on the destination", )); } // The comment from writeback applies here, too fs.posix_acl.store(self.posix_acl, Ordering::Relaxed); fs.sup_group_extension .store(self.sup_group_extension, Ordering::Relaxed); Ok(()) } } impl serialized::Inode { /// Deserialize this inode into `fs`'s inode store. Return `Ok(true)` on success, `Err(_)` on /// error, and `Ok(false)` when there is a dependency to another inode that has not yet been /// deserialized, so deserialization should be re-attempted later. 
fn deserialize_with_fs( &self, fs: &PassthroughFs, mount_fds: &HashMap>, ) -> io::Result { match &self.location { serialized::InodeLocation::RootNode => { if self.id != fuse::ROOT_ID { return Err(other_io_error(format!( "Node with non-root ID ({}) given as root node", self.id ))); } self.deserialize_root_node(fs)?; Ok(true) } serialized::InodeLocation::Path { parent, filename } => { if self.id == fuse::ROOT_ID { return Err(other_io_error( "Refusing to use path given for root node".to_string(), )); } let parent_ref = match fs.inodes.get(*parent) { None => { // `parent` not found yet, defer deserialization until it is present return Ok(false); } Some(parent_data) => { // Safe because the migration source guarantees that this reference is // included in the parent node's refcount. Once we have deserialized this // inode, we must drop that reference, and moving it into // `deserialize_path()` will achieve that. unsafe { StrongInodeReference::new_no_increment(parent_data, &fs.inodes) } } }; let inode_data = self .deserialize_path(fs, parent_ref, filename) .or_else(|err| self.deserialize_invalid_inode(fs, err))?; let inode_data = match self.check_file_handle(&inode_data) { Ok(()) => inode_data, Err(err) => self.deserialize_invalid_inode(fs, err)?, }; fs.inodes.new_inode(inode_data)?; Ok(true) } serialized::InodeLocation::Invalid => { let err = io::Error::new( io::ErrorKind::NotFound, format!("Migration source has lost inode {}", self.id), ); let inode_data = self.deserialize_invalid_inode(fs, err)?; fs.inodes.new_inode(inode_data)?; Ok(true) } serialized::InodeLocation::FullPath { filename } => { if self.id == fuse::ROOT_ID { return Err(other_io_error( "Refusing to use path given for root node".to_string(), )); } let Ok(shared_dir) = fs.inodes.get_strong(fuse::ROOT_ID) else { // No root node? Defer until we have it. 
return Ok(false); }; let inode_data = self .deserialize_path(fs, shared_dir, filename) .or_else(|err| self.deserialize_invalid_inode(fs, err))?; fs.inodes.new_inode(inode_data)?; Ok(true) } serialized::InodeLocation::FileHandle { handle } => { if self.id == fuse::ROOT_ID { return Err(other_io_error( "Refusing to use file handle given for root node".to_string(), )); } let inode_data = self .deserialize_file_handle(fs, mount_fds, handle) .or_else(|err| self.deserialize_invalid_inode(fs, err))?; fs.inodes.new_inode(inode_data)?; Ok(true) } } } /** * “Deserialize” the root node. * * We will not get any information about it from the source because its location is always * defined on the command line, so all we do is open that location and apply the refcount the * source had for it. * * `self.id` must be the FUSE root inode ID. */ fn deserialize_root_node(&self, fs: &PassthroughFs) -> io::Result<()> { assert!(self.id == fuse::ROOT_ID); if !matches!(&self.location, serialized::InodeLocation::RootNode) { return Err(other_io_error( "Root node has not been serialized as root node", )); } // We open the root node ourselves (from the configuration the user gave us)... fs.open_root_node()?; // ...and only take the refcount from the source, ignoring filename and parent information. // Note that we must not call `fs.open_root_node()` before we have the correct refcount, or // deserializing child nodes (which drops one reference each) would quickly reduce the // refcount below 0. let root_data = fs.inodes.get(fuse::ROOT_ID).unwrap(); root_data.refcount.store(self.refcount, Ordering::Relaxed); // For the root node, a non-matching file handle is always a hard error. We cannot // deserialize the root node as an invalid node. self.check_file_handle(&root_data)?; Ok(()) } /// Helper function for `deserialize_with_fs()`: Try to locate an inode based on its parent /// directory and its filename. /// Takes ownership of the `parent` strong reference and drops it. 
/// On success, returns `InodeData` to add to `fs.inodes`. fn deserialize_path( &self, fs: &PassthroughFs, parent: StrongInodeReference, filename: &str, ) -> io::Result { let parent_fd = parent.get().get_file()?; let fd = openat( &parent_fd, filename, libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, ) .map_err(|err| { let pfd = printable_fd(&parent_fd, Some(&fs.proc_self_fd)); io::Error::new( err.kind(), format!( "Opening {}{}{}: {}", pfd, if pfd.ends_with('/') { "" } else { "/" }, filename, err ), ) })?; let st = statx(&fd, None)?; let handle = fs.get_file_handle_opt(&fd, &st)?; let file_or_handle = if let Some(h) = handle.as_ref() { FileOrHandle::Handle(fs.make_file_handle_openable(h)?) } else { FileOrHandle::File(fd) }; Ok(InodeData { inode: self.id, file_or_handle, refcount: AtomicU64::new(self.refcount), ids: InodeIds { ino: st.st.st_ino, dev: st.st.st_dev, mnt_id: st.mnt_id, }, mode: st.st.st_mode, migration_info: Mutex::new(None), }) } /// Helper function for `deserialize_with_fs()`: Handle invalid inodes, i.e. ones that cannot /// be located. /// Depending on the configuration, they either cause a hard error, or should be added as /// explicitly invalid inodes to `fs.inodes` (in which case their `InodeData` is returned). 
fn deserialize_invalid_inode( &self, fs: &PassthroughFs, err: io::Error, ) -> io::Result { match fs.cfg.migration_on_error { MigrationOnError::Abort => Err(err.context(format!("Inode {}", self.id))), MigrationOnError::GuestError => { warn!("Invalid inode {} indexed: {}", self.id, err); Ok(InodeData { inode: self.id, file_or_handle: FileOrHandle::Invalid(Arc::new(err)), refcount: AtomicU64::new(self.refcount), ids: Default::default(), mode: Default::default(), migration_info: Default::default(), }) } } } /// If the source sent us a reference file handle, check it against `inode_data`'s file handle fn check_file_handle(&self, inode_data: &InodeData) -> io::Result<()> { let Some(ref_fh) = &self.file_handle else { return Ok(()); }; let is_fh: SerializableFileHandle = (&inode_data.file_or_handle).try_into()?; // Disregard the mount ID, this may be a different host, so the mount ID may differ is_fh.require_equal_without_mount_id(ref_fh).map_err(|err| { other_io_error(format!( "Inode {} is not the same inode as in the migration source: {}", self.id, err )) }) } /** * Helper function for `deserialize_with_fs()`: Handle file handles. * * Get a mount FD for the given file handle, turning it into an * [`OpenableFileHandle`](crate::passthrough::file_handle::OpenableFileHandle). Then get the * [`InodeIds`] we need to complete the [`InodeData`] object, and return that. 
*/ fn deserialize_file_handle( &self, fs: &PassthroughFs, mount_fds: &HashMap>, handle: &SerializableFileHandle, ) -> io::Result { let source_mount_id = handle.mount_id(); let mfd = mount_fds .get(&source_mount_id) .ok_or_else(|| other_io_error(format!("Unknown mount ID {source_mount_id}")))?; let ofh = handle.to_openable(Arc::clone(mfd))?; let fd = ofh .open(libc::O_PATH) .err_context(|| "Opening file handle")?; let st = statx(&fd, None).err_context(|| "stat")?; let file_or_handle = match fs.cfg.inode_file_handles { InodeFileHandlesMode::Never => FileOrHandle::File(fd), InodeFileHandlesMode::Mandatory | InodeFileHandlesMode::Prefer => { FileOrHandle::Handle(ofh) } }; Ok(InodeData { inode: self.id, file_or_handle, refcount: AtomicU64::new(self.refcount), ids: InodeIds { ino: st.st.st_ino, dev: st.st.st_dev, mnt_id: st.mnt_id, }, mode: st.st.st_mode, migration_info: Mutex::new(None), }) } } impl serialized::Handle { /// Deserialize this handle into `fs`'s handle map. fn deserialize_with_fs(&self, fs: &PassthroughFs) -> io::Result<()> { let inode = fs .inodes .get(self.inode) .ok_or_else(|| other_io_error(format!("Inode {} not found", self.inode)))?; let (file, migration_info) = match self.source { serialized::HandleSource::OpenInode { flags } => { let handle_data_file = match inode .open_file(flags, &fs.proc_self_fd) .and_then(|f| f.into_file()) { Ok(f) => HandleDataFile::File(RwLock::new(f)), Err(err) => { let error_msg = if let Ok(path) = inode.get_path(&fs.proc_self_fd) { let p = path.as_c_str().to_string_lossy(); format!( "Opening inode {} ({}) as handle {}: {}", self.inode, p, self.id, err ) } else { format!( "Opening inode {} as handle {}: {}", self.inode, self.id, err ) }; let err = io::Error::new(err.kind(), error_msg); match fs.cfg.migration_on_error { MigrationOnError::Abort => return Err(err), MigrationOnError::GuestError => { warn!("Invalid handle {} is open in guest: {}", self.id, err); HandleDataFile::Invalid(Arc::new(err)) } } } }; let migration_info 
= HandleMigrationInfo::OpenInode { flags }; (handle_data_file, migration_info) } }; let handle_data = HandleData { inode: self.inode, file, migration_info, }; fs.handles .write() .unwrap() .insert(self.id, Arc::new(handle_data)); Ok(()) } } virtiofsd-1.13.0/src/passthrough/device_state/mod.rs000064400000000000000000000110071046102023000206400ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. /*! * Module for migrating our internal FS state (i.e. serializing and deserializing it), with the * following submodules: * - serialized: Serialized data structures * - preserialization: Structures and functionality for preparing for migration (serialization), * i.e. define and construct the precursors to the eventually serialized * information that are stored alongside the associated inodes and handles they * describe * - serialization: Functionality for serializing * - deserialization: Functionality for deserializing */ mod deserialization; pub(super) mod preserialization; mod serialization; mod serialized; use crate::filesystem::SerializableFileSystem; use crate::passthrough::{MigrationMode, PassthroughFs}; use preserialization::proc_paths::{self, ConfirmPaths, ImplicitPathCheck}; use preserialization::{file_handles, find_paths}; use std::convert::{TryFrom, TryInto}; use std::fs::File; use std::io::{self, Read, Write}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; /// Adds serialization (migration) capabilities to `PassthroughFs` impl SerializableFileSystem for PassthroughFs { fn prepare_serialization(&self, cancel: Arc) { self.inodes.clear_migration_info(); // Set this so the filesystem code knows that every node is supposed to have up-to-date // migration information. 
For example, nodes that are created after they would have been // visited by the reconstructor below will not get migration info, unless the general // filesystem code makes an effort to set it (when the node is created). self.track_migration_info.store(true, Ordering::Relaxed); match self.cfg.migration_mode { MigrationMode::FindPaths => { // Create the reconstructor (which reconstructs parent+filename information for // each node in our inode store), and run it. Try the proc_paths module first, if // that advises us to fall back, try find_paths second. if proc_paths::Constructor::new(self, Arc::clone(&cancel)).execute() { warn!("Falling back to iterating through the shared directory to reconstruct paths for migration"); find_paths::Constructor::new(self, Arc::clone(&cancel)).execute(); } } MigrationMode::FileHandles => { // Get file handles for each node in our inode store file_handles::Constructor::new(self, Arc::clone(&cancel)).execute(); } } // Check reconstructed paths once. This is to rule out TOCTTOU problems, specifically the // following: // 1. Our preserialization constructor above finds a path for some inode // 2. That inode is concurrently unlinked by the guest, so its inode migration info is // invalidated // 3. The preserialization constructor then constructs an inode migration info with the // path it found (that is now wrong), adding it to the inode // To fix this problem, preserialization must re-check each path after putting it into the // `InodeData.migration_info` field. Do that by running the proc_paths checker. 
let checker = ImplicitPathCheck::new(self, cancel); checker.check_paths(); } fn serialize(&self, mut state_pipe: File) -> io::Result<()> { self.track_migration_info.store(false, Ordering::Relaxed); if self.cfg.migration_confirm_paths { let checker = ConfirmPaths::new(self); if let Err(err) = checker.confirm_paths() { self.inodes.clear_migration_info(); return Err(err); } } let state = serialized::PassthroughFs::V2(self.into()); self.inodes.clear_migration_info(); let serialized: Vec = state.try_into()?; state_pipe.write_all(&serialized)?; Ok(()) } fn deserialize_and_apply(&self, mut state_pipe: File) -> io::Result<()> { let mut serialized: Vec = Vec::new(); state_pipe.read_to_end(&mut serialized)?; match serialized::PassthroughFs::try_from(serialized)? { serialized::PassthroughFs::V1(state) => state.apply(self)?, serialized::PassthroughFs::V2(state) => state.apply(self)?, }; Ok(()) } } virtiofsd-1.13.0/src/passthrough/device_state/preserialization/file_handles.rs000064400000000000000000000114021046102023000260610ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. //! Preserialization implementation to represent all inodes as their file handles. use super::{InodeLocation, InodeMigrationInfo}; use crate::passthrough::file_handle::{self, FileOrHandle, SerializableFileHandle}; use crate::passthrough::inode_store::{InodeData, StrongInodeReference}; use crate::passthrough::PassthroughFs; use crate::util::ResultErrorContext; use std::fmt::{self, Display}; use std::io; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; /// The result of *file-handles* pre-serialization: A file handle. pub(in crate::passthrough) struct FileHandle { /** * The file handle. * * Its mount ID is only valid on the migration source (i.e. here, because the source is where * preserialization occurs). 
*/ pub handle: SerializableFileHandle, } /** * Construct file handles during preserialization. * * Generate a file handle for all inodes that don’t have a migration info set yet. */ pub(in crate::passthrough::device_state) struct Constructor<'a> { /// Reference to the filesystem for which to reconstruct inodes’ paths. fs: &'a PassthroughFs, /// Set to true when we are supposed to cancel. cancel: Arc, } impl<'a> Constructor<'a> { /// Prepare to collect file handles for `fs`’s inodes. pub fn new(fs: &'a PassthroughFs, cancel: Arc) -> Self { Constructor { fs, cancel } } /** * Collect file handles for all inodes in our inode store, during preserialization. * * Recurse from the root directory (the shared directory), constructing `InodeMigrationInfo` * data for every inode in the inode store. This may take a long time, which is why it is done * in the preserialization phase. * * Cannot fail: Collecting inodes’ migration info is supposed to be a best-effort operation. * We can leave any and even all inodes’ migration info empty, then serialize them as invalid * inodes, and let the destination decide what to do based on its `--migration-on-error` * setting. */ pub fn execute(self) { for inode_data in self.fs.inodes.iter() { if self.cancel.load(Ordering::Relaxed) { break; } // Migration info is automatically cleared before `execute()`, so if we find migration // info here, it must be up-to-date, and we don't need to fill it in. if inode_data.migration_info.lock().unwrap().is_some() { continue; } if let Err(err) = self.set_migration_info(&inode_data) { error!( "Inode {} ({}): {}", inode_data.inode, inode_data.identify(&self.fs.proc_self_fd), err ); } } } } impl FileHandle { /// Trivial constructor. pub fn new(handle: SerializableFileHandle) -> Self { FileHandle { handle } } /** * Call `f` for each [`StrongInodeReference`] we have in `self`. * * File handles never contain references to other inodes, so this is a no-op. 
*/ pub(super) fn for_each_strong_reference(self, _f: F) {} } impl From for InodeLocation { fn from(fh: FileHandle) -> Self { InodeLocation::FileHandle(fh) } } impl Display for FileHandle { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "[file handle: {}]", self.handle) } } impl Constructor<'_> { /** * Set `inode_data`’s migration info to its file handle. * * Try to generate a file handle for `inode_data` (or use the one we already have, if any), and * construct and set its migration info based on it. */ fn set_migration_info(&self, inode_data: &InodeData) -> io::Result<()> { let handle: SerializableFileHandle = match &inode_data.file_or_handle { FileOrHandle::File(file) => file_handle::FileHandle::from_fd_fail_hard(file) .err_context(|| "Failed to generate file handle")? .into(), FileOrHandle::Handle(handle) => handle.inner().into(), FileOrHandle::Invalid(err) => return Err(io::Error::new( err.kind(), format!("Inode is invalid because of an error during the preceding migration, which was: {err}"), )), }; let mig_info = InodeMigrationInfo::new_internal( &self.fs.cfg, FileHandle::new(handle.clone()), || Ok(handle), )?; *inode_data.migration_info.lock().unwrap() = Some(mig_info); Ok(()) } } virtiofsd-1.13.0/src/passthrough/device_state/preserialization/find_paths.rs000064400000000000000000000310661046102023000255730ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
use super::{InodeLocation, InodeMigrationInfo}; use crate::filesystem::DirectoryIterator; use crate::fuse; use crate::passthrough::file_handle::{FileHandle, SerializableFileHandle}; use crate::passthrough::inode_store::{InodeData, InodeIds, StrongInodeReference}; use crate::passthrough::stat::statx; use crate::passthrough::{FileOrHandle, PassthroughFs}; use crate::read_dir::ReadDir; use crate::util::{other_io_error, ResultErrorContext}; use std::convert::{TryFrom, TryInto}; use std::ffi::{CStr, CString}; use std::fmt::{self, Display}; use std::fs::File; use std::io; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; /// The result of 'find-paths' pre-serialization: A filename relative to some parent inode. pub(in crate::passthrough) struct InodePath { pub parent: StrongInodeReference, pub filename: String, } /// Stores state for constructing serializable data for inodes using the `InodeMigrationInfo::Path` /// variant, in order to prepare for migration. pub(in crate::passthrough::device_state) struct Constructor<'a> { /// Reference to the filesystem for which to reconstruct inodes' paths. fs: &'a PassthroughFs, /// Set to true when we are supposed to cancel cancel: Arc, } impl InodePath { /// Create the migration info for an inode that is collected during the `prepare_serialization` /// phase pub fn new_with_cstr(parent_ref: StrongInodeReference, filename: &CStr) -> io::Result { let utf8_name = filename.to_str().map_err(|err| { other_io_error(format!( "Cannot convert filename into UTF-8: {filename:?}: {err}" )) })?; Ok(InodePath { parent: parent_ref, filename: utf8_name.to_string(), }) } pub(super) fn for_each_strong_reference(self, mut f: F) { f(self.parent); } /// Checker whether the associated inode (`inode_data`) is present under this path, returning /// an error if (and only if) it is not. 
pub(super) fn check_presence( &self, inode_data: &InodeData, full_info: &InodeMigrationInfo, ) -> io::Result<()> { let filename = CString::new(self.filename.clone())?; let parent_fd = self.parent.get().get_file()?; let st = statx(&parent_fd, Some(&filename))?; if st.st.st_dev != inode_data.ids.dev { return Err(other_io_error(format!( "Device ID differs: Expected {}, found {}", inode_data.ids.dev, st.st.st_dev ))); } // Try to take a file handle from the migration info; if none is there, try to generate it // (but ignore errors, falling back to checking the inode ID). We do really want to check // the file handle if possible, though, to detect inode ID reuse. let (fh, fh_ref) = if let Some(fh_ref) = full_info.file_handle.as_ref() { (None, Some(fh_ref)) } else if let Ok(fh) = SerializableFileHandle::try_from(&inode_data.file_or_handle) { (Some(fh), None) } else { (None, None) }; if let Some(fh) = fh_ref.or(fh.as_ref()) { // If we got a file handle for `inode_data`, failing to get it for `filename` probably // means it is a different inode. Be cautious and return an error then. let actual_fh = FileHandle::from_name_at_fail_hard(&parent_fd, &filename) .err_context(|| "Failed to generate file handle")?; // Ignore mount ID: A file handle can be in two different mount IDs, but as long as it // is on the same device, it is still the same mount ID; and we have already checked // the device ID. fh.require_equal_without_mount_id(&actual_fh.into()) .map_err(other_io_error) } else { // Cannot generate file handle? Fall back to just the inode ID. 
if st.st.st_ino != inode_data.ids.ino { return Err(other_io_error(format!( "Inode ID differs: Expected {}, found {}", inode_data.ids.ino, st.st.st_ino ))); } Ok(()) } } } impl From for InodeLocation { fn from(path: InodePath) -> Self { InodeLocation::Path(path) } } impl Display for InodePath { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let parent = self.parent.get(); let parent_mig_info_locked = parent.migration_info.lock().unwrap(); if let Some(parent_mig_info) = parent_mig_info_locked.as_ref() { write!(f, "{}/{}", parent_mig_info.location, self.filename) } else { write!(f, "[inode {}]/{}", parent.inode, self.filename) } } } /// The `Constructor` is an `InodeMigrationInfoConstructor` that creates `InodeMigrationInfo` of /// the `InodeMigrationInfo::Path` variant: It recurses through the filesystem (i.e. the shared /// directory), matching up all inodes it finds with our inode store, and thus finds the parent /// directory node and filename for every such inode. impl<'a> Constructor<'a> { pub fn new(fs: &'a PassthroughFs, cancel: Arc) -> Self { Constructor { fs, cancel } } /** * Collect paths for all inodes in our inode store, during preserialization. * * Recurse from the root directory (the shared directory), constructing `InodeMigrationInfo` * data for every inode in the inode store. This may take a long time, which is why it is done * in the preserialization phase. * * Cannot fail: Collecting inodes’ migration info is supposed to be a best-effort operation. * We can leave any and even all inodes’ migration info empty, then serialize them as invalid * inodes, and let the destination decide what to do based on its --migration-on-error setting. */ pub fn execute(self) { // Only need to do something if we have a root node to recurse from; otherwise the // filesystem is not mounted and we do not need to do anything. 
if let Ok(root) = self.fs.inodes.get_strong(fuse::ROOT_ID) { self.recurse_from(root); } } /// Recurse from the given directory inode fn recurse_from(&self, root_ref: StrongInodeReference) { let mut dir_buf = vec![0u8; 1024]; // We don't actually use recursion (to not exhaust the stack), but keep a list of // directories we still need to visit, and pop from it until it is empty and we're done let mut remaining_dirs = vec![root_ref]; while let Some(inode_ref) = remaining_dirs.pop() { let dirfd = match inode_ref.get().open_file( libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC, &self.fs.proc_self_fd, ) { Ok(fd) => fd, Err(err) => { let dir_id = inode_ref.get().identify(&self.fs.proc_self_fd); warn!("Failed to recurse into {dir_id}: {err}"); continue; } }; // Read all directory entries, check them for matches in our inode store, and add any // directory to `remaining_dirs` loop { // Safe because we use nothing but this function on the FD let read_dir_result = unsafe { ReadDir::new_no_seek(&dirfd, dir_buf.as_mut()) }; let mut entries = match read_dir_result { Ok(entries) => entries, Err(err) => { let dir_id = inode_ref.get().identify(&self.fs.proc_self_fd); warn!("Failed to read directory entries of {dir_id}: {err}"); break; } }; if entries.remaining() == 0 { break; } while let Some(entry) = entries.next() { if self.cancel.load(Ordering::Relaxed) { return; } match self.discover(&inode_ref, &dirfd, entry.name) { Ok(Some(entry_inode)) => { // Add directories to visit to the list remaining_dirs.push(entry_inode); } Ok(None) => (), Err(err) => { let dir_id = inode_ref.get().identify(&self.fs.proc_self_fd); let name = entry.name.to_string_lossy(); warn!("Failed to discover entry {name} of {dir_id}: {err}"); } } } } } } /// Check the given directory entry (parent + name) for matches in our inode store. If we find /// any corresponding `InodeData` there, its `.migration_info` is set accordingly. 
/// For all directories (and directories only), return a strong reference to an inode in our /// store that can be used to recurse further. fn discover( &self, parent_reference: &StrongInodeReference, parent_fd: &F, name: &CStr, ) -> io::Result> { let utf8_name = name.to_str().map_err(|err| { other_io_error(format!( "Cannot convert filename into UTF-8: {name:?}: {err}", )) })?; // Ignore these if utf8_name == "." || utf8_name == ".." { return Ok(None); } let path_fd = { let fd = self .fs .open_relative_to(parent_fd, name, libc::O_PATH, None)?; unsafe { File::from_raw_fd(fd) } }; let stat = statx(&path_fd, None)?; let handle = self.fs.get_file_handle_opt(&path_fd, &stat)?; let ids = InodeIds { ino: stat.st.st_ino, dev: stat.st.st_dev, mnt_id: stat.mnt_id, }; let is_directory = stat.st.st_mode & libc::S_IFMT == libc::S_IFDIR; if let Ok(inode_ref) = self.fs.inodes.claim_inode(handle.as_ref(), &ids) { let mig_info = InodeMigrationInfo::new_internal( &self.fs.cfg, InodePath { parent: StrongInodeReference::clone(parent_reference), filename: utf8_name.to_string(), }, || { Ok(match &handle { Some(h) => h.into(), None => FileHandle::from_fd_fail_hard(&path_fd)?.into(), }) }, )?; *inode_ref.get().migration_info.lock().unwrap() = Some(mig_info); return Ok(is_directory.then_some(inode_ref)); } // We did not find a matching entry in our inode store. In case of non-directories, we are // done. if !is_directory { return Ok(None); } // However, in case of directories, we must create an entry, so we can return it. // (Our inode store may still have matching entries recursively downwards from this // directory. Because every node is serialized referencing its parent, this directory // inode may end up being recursively referenced this way, we don't know yet. // In case there is no such entry, the refcount will eventually return to 0 before // `Self::execute()` returns, dropping it from the inode store again, so it will not // actually end up being serialized.) 
let file_or_handle = if let Some(h) = handle.as_ref() { FileOrHandle::Handle(self.fs.make_file_handle_openable(h)?) } else { FileOrHandle::File(path_fd) }; let mig_info = InodeMigrationInfo::new_internal( &self.fs.cfg, InodePath { parent: StrongInodeReference::clone(parent_reference), filename: utf8_name.to_string(), }, || (&file_or_handle).try_into(), )?; let new_inode = InodeData { inode: self.fs.next_inode.fetch_add(1, Ordering::Relaxed), file_or_handle, refcount: AtomicU64::new(1), ids, mode: stat.st.st_mode, migration_info: Mutex::new(Some(mig_info)), }; Ok(Some(self.fs.inodes.get_or_insert(new_inode)?)) } } virtiofsd-1.13.0/src/passthrough/device_state/preserialization/mod.rs000064400000000000000000000154261046102023000242350ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use crate::passthrough::file_handle::{FileOrHandle, SerializableFileHandle}; use crate::passthrough::inode_store::{InodeData, StrongInodeReference}; use crate::passthrough::{self, MigrationMode}; use crate::util::ResultErrorContext; use std::convert::TryInto; use std::ffi::CStr; use std::fmt::{self, Display}; use std::io; pub mod file_handles; pub mod find_paths; pub mod proc_paths; /// Precursor to `serialized::Inode` that is constructed while serialization is being prepared, and /// will then be transformed into the latter at the time of serialization. To be stored in the /// inode store, alongside each inode (i.e. in its `InodeData`). Constructing this is costly, so /// should only be done when necessary, i.e. when actually preparing for migration. pub(in crate::passthrough) struct InodeMigrationInfo { /// Location of the inode (how the destination can find it) pub location: InodeLocation, /// The inode's file handle. The destination is not supposed to open this handle, but instead /// compare it against the one from the inode it has opened based on `location`. 
pub file_handle: Option, } pub(in crate::passthrough) enum InodeLocation { /// The root node: No information is stored, the destination is supposed to find this on its /// own (as configured by the user) RootNode, /// Inode is represented by its parent directory and its filename therein, allowing the /// destination to `openat(2)` it Path(find_paths::InodePath), /// Inode is represented by its file handle FileHandle(file_handles::FileHandle), } /// Precursor to `SerializableHandleRepresentation` that is constructed while serialization is /// being prepared, and will then be transformed into the latter at the time of serialization. /// To be stored in the `handles` map, alongside each handle (i.e. in its `HandleData`). /// Constructing this is cheap, so can be done whenever any handle is created. pub(in crate::passthrough) enum HandleMigrationInfo { /// Handle can be opened by opening its associated inode with the given `open(2)` flags OpenInode { flags: i32 }, } impl InodeMigrationInfo { /// General function for public use that creates the correct `InodeLocation` variant based on /// the `migration_mode` setting pub fn new( fs_cfg: &passthrough::Config, parent_ref: StrongInodeReference, filename: &CStr, file_or_handle: &FileOrHandle, ) -> io::Result { let location: InodeLocation = match fs_cfg.migration_mode { MigrationMode::FindPaths => { find_paths::InodePath::new_with_cstr(parent_ref, filename)?.into() } MigrationMode::FileHandles => { let handle = file_or_handle.try_into().err_context(|| { format!( "(inode {})/{:?}: Failed to generate file handle", parent_ref.get().inode, filename, ) })?; file_handles::FileHandle::new(handle).into() } }; Self::new_internal(fs_cfg, location, || file_or_handle.try_into()) } /// Internal `new` function that takes the actually constituting elements of the struct fn new_internal, F: FnOnce() -> io::Result>( fs_cfg: &passthrough::Config, inode_location: L, file_handle_fn: F, ) -> io::Result { let file_handle: Option = if 
fs_cfg.migration_verify_handles { Some(file_handle_fn()?) } else { None }; Ok(InodeMigrationInfo { location: inode_location.into(), file_handle, }) } /// Use this for the root node. That node is special in that the destination gets no /// information on how to find it, because that is configured by the user. pub(in crate::passthrough) fn new_root( fs_cfg: &passthrough::Config, file_or_handle: &FileOrHandle, ) -> io::Result { Self::new_internal(fs_cfg, InodeLocation::RootNode, || { file_or_handle.try_into() }) } /// Call the given function for each `StrongInodeReference` contained in this /// `InodeMigrationInfo` pub fn for_each_strong_reference(self, f: F) { match self.location { InodeLocation::RootNode => (), InodeLocation::Path(p) => p.for_each_strong_reference(f), InodeLocation::FileHandle(fh) => fh.for_each_strong_reference(f), } } /** * Return `true` if this migration info contains a path. * * If so, when the associated inode’s path is modified or invalidated (e.g. renamed, moved, * unlinked), its migration info must then be updated or invalidated accordingly. */ pub fn has_path(&self) -> bool { // Use `match` instead of `matches!()` so we don’t forget any potential future variants match &self.location { InodeLocation::RootNode => false, InodeLocation::Path(_) => true, InodeLocation::FileHandle(_) => false, } } /** * Assuming this migration info contains a path, check whether the associated inode (given * through `inode_data`) is indeed present under that path, returning an error if (and only if) * it is not. * * Always return `Ok(())` if this migration info’s location is not defined by a path. 
*/ pub fn check_path_presence(&self, inode_data: &InodeData) -> io::Result<()> { match &self.location { InodeLocation::RootNode => Ok(()), InodeLocation::Path(p) => p.check_presence(inode_data, self), InodeLocation::FileHandle(_) => Ok(()), } } } impl Display for InodeLocation { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { InodeLocation::RootNode => write!(f, "[shared directory root]"), InodeLocation::Path(p) => write!(f, "{p}"), InodeLocation::FileHandle(fh) => write!(f, "{fh}"), } } } impl HandleMigrationInfo { /// Create the migration info for a handle that will be required when serializing pub fn new(flags: i32) -> Self { HandleMigrationInfo::OpenInode { // Remove flags that make sense when the file is first opened by the guest, but which // we should not set when continuing to use the file after migration because they would // e.g. modify the file flags: flags & !(libc::O_CREAT | libc::O_EXCL | libc::O_TRUNC), } } } virtiofsd-1.13.0/src/passthrough/device_state/preserialization/proc_paths.rs000064400000000000000000000477271046102023000256310ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. /*! * Facilities for getting inodes’ paths from /proc/self/fd for migration. 
* * This module provides different objects that all share the same core for multiple purposes: * - Provide a preserialization migration info constructor for the find-paths migration mode * - Check migration info paths during migration and, if found incorrect, reconstruct them as we * would for preserialization; this is used by --migration-confirm-paths, as well as an implicit * double-check step after any path-based preserialization phase */ use super::InodeMigrationInfo; use crate::fuse; use crate::passthrough::inode_store::{InodeData, InodePathError, StrongInodeReference}; use crate::passthrough::stat::statx; use crate::passthrough::util::{relative_path, FdPathError}; use crate::passthrough::PassthroughFs; use crate::util::{other_io_error, ErrorContext}; use std::ffi::{CStr, CString}; use std::io; use std::path::Path; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; /** * Provides all core functionality. * * This module provides functionality for three different cases; all of it is implemented on this * single internal struct that is incorporated into different public structs depending on the use. * * `Walker::run()` is the core method, which walks over the inode store and can check paths in * inode migration info structures, and construct them by looking into /proc/self/fd. What exactly * is done depends on `mode`. */ struct Walker<'a> { /// Reference to the filesystem state to check fs: &'a PassthroughFs, /// Specifies which functionality we are supposed to provide #[allow(dead_code)] // will be used once we provide more than one mode mode: Mode, /// Optional: Cancel early cancel: Option>, } /** * Construct paths during preserialization. * * Give all inodes that don’t have a migration info set a path from /proc/self/fd. */ pub(in crate::passthrough::device_state) struct Constructor<'a> { /// `Walker` in `Mode::Constructor` mode. walker: Walker<'a>, } /** * `--migration-confirm-paths` implementation. 
* * Implements checking inodes’ paths right before serialization, as requested by the user through * the `--migration-confirm-paths` switch: Give all inodes that either don’t have a migration info * set, or where it is found to be incorrect, a path from /proc/self/fd. Furthermore, given the * user has specifically requested this check run, return any error as a hard error, preventing * migration. */ pub(in crate::passthrough::device_state) struct ConfirmPaths<'a> { /// `Walker` in `Mode::ConfirmPaths` mode. walker: Walker<'a>, } /** * Double-check inodes’ paths after preserialization. * * Similar to `ConfirmPaths`, but is an implicit double-check run after the first preserialization * phase, and as a result, is more relaxed: * - On a fundamental unrecoverable error (e.g. failing to find the shared directory’s base path), * printing a warning an skipping the whole run is OK * - We only need to find new paths for inodes that have a path in their migration info when we * found that path to be incorrect. No need to try to find paths for inodes that don’t have any * migration info attached to them. */ pub(in crate::passthrough::device_state) struct ImplicitPathCheck<'a> { /// `Walker` in `Mode::ImplicitPathCheck` mode. walker: Walker<'a>, } /// Selects how a `Walker` should behave. pub(in crate::passthrough::device_state) enum Mode { /// Collect inodes’s paths during preserialization. Constructor, /// Run the `--migration-confirm-paths` check. ConfirmPaths, /// Double-check inodes’ paths after preserialization. ImplicitPathCheck, } /** * Error type to enable `--migration-mode=find-paths` fall-back functionality. * * `--migration-mode=find-paths` first tries to get inodes’ paths from /proc/self/fd. That * implementation is provided by this module. If that fails, this error allows distinguishing * between: * - errors that may not happen when using another method of finding inodes’ paths (e.g. 
* exhaustive iteration of everything inside of the shared directory), and * - errors that would probably happen regardless. * * That is, if encountering errors of the former type, we should fall back to the other method * (provided by [`super::find_paths`]). */ pub(in crate::passthrough) enum WrappedError { /// A different preserialization method might be able to find this path. Fallback(io::Error), /// Unrecoverable error, falling back probably won’t change anything. Unrecoverable(io::Error), } impl<'a> Constructor<'a> { /// Prepare to collect paths for `fs`. pub fn new(fs: &'a PassthroughFs, cancel: Arc) -> Self { Constructor { walker: Walker::new(fs, Mode::Constructor, Some(cancel)), } } /** * Collect paths for all inodes in our inode store, during preserialization. * * Look through all inodes in our inode store, try to get their paths from /proc/self/fd, * constructing `InodeMigrationInfo` data for them. May take some time, so is done during the * pre-serialization phase of migration. * * Cannot fail: Collecting inodes’ migration info is supposed to be a best-effort operation. * We can leave any and even all inodes’ migration info empty, then serialize them as invalid * inodes, and let the destination decide what to do based on its `--migration-on-error` * setting. * * However, it is possible that we find inodes whose paths we failed to get from /proc/self/fd, * but believe they probably do have a valid path inside the shared directory anyway (which the * kernel just failed to report); in this case, return `true` so the caller can decide to fall * back to the [`super::find_paths`] implementation. 
*/ pub fn execute(self) -> bool { match self.walker.run() { Ok(()) => false, Err(WrappedError::Fallback(err)) => { warn!("Failed to construct inode paths: {err}"); true } // Unrecoverable error where not even falling back makes sense should be a rare // occurrence Err(WrappedError::Unrecoverable(err)) => { error!("Failed to construct inode paths: {err}; may be unable to migrate"); false } } } } impl<'a> ConfirmPaths<'a> { /// Prepare to confirm paths collected for `fs`. pub fn new(fs: &'a PassthroughFs) -> Self { ConfirmPaths { walker: Walker::new(fs, Mode::ConfirmPaths, None), } } /** * Run the `--migration-confirm-paths` check. * * If necessary, try to fix the paths collected during the preserialization phase by looking * into /proc/self/fd. Return errors. */ pub fn confirm_paths(self) -> io::Result<()> { // There is no fallback in `ConfirmPaths` mode, treat all errors the same way self.walker.run().map_err(WrappedError::into_inner) } } impl<'a> ImplicitPathCheck<'a> { /// Prepare to double-check paths during preserialization. pub fn new(fs: &'a PassthroughFs, cancel: Arc) -> Self { ImplicitPathCheck { walker: Walker::new(fs, Mode::ImplicitPathCheck, Some(cancel)), } } /** * Double-check inodes’ paths after preserialization. * * Try to fix any paths that are wrong (by getting new paths from /proc/self/fd), but do not * return errors: This check is implicit, not requested by the user, so should be infallible, * not cancelling migration on error. */ pub fn check_paths(self) { if let Err(err) = self.walker.run() { // There is no fallback in `ImplicitPathCheck` mode, treat all errors the same way let err = err.into_inner(); warn!("Double-check of all inode paths collected for migration failed: {err}") } } } impl<'a> Walker<'a> { /** * Create a `Walker` over `fs` with the given `mode`. * * If `cancel` is given, the operation will be cancelled when it is found to be set. 
*/ fn new(fs: &'a PassthroughFs, mode: Mode, cancel: Option>) -> Self { Walker { fs, mode, cancel } } /** * Run the `Walker` over all inodes in our store. * * Iterate through the store, check the paths we found (depending on the `mode`), and update * inodes’ migration info with paths from /proc/self/fd (depending on the `mode`). * * In case of error, differentiate between: * - `Fallback(err)`: We failed to construct inode migration info for some number of inodes. * However, we expect a different, more exhaustive method to find inodes’ * paths (e.g. DFS through the shared directory) can succeed. In case of * `Mode::Constructor`, the caller must fall back to such a different * preserialization module (i.e. [`super::find_paths`]). In other modes, * this should be treated the same as `Unrecoverable`. * - `Unrecoverable(err)`: Hard error, falling back to a different method is not advised. */ fn run(self) -> Result<(), WrappedError> { let Some(root_node) = self.fs.inodes.get(fuse::ROOT_ID) else { // No root? That’s fine if and only if we don’t have any inodes at all. return if self.fs.inodes.is_empty() { Ok(()) } else { // Should never happen, consider this error unrecoverable Err(WrappedError::Unrecoverable(other_io_error( "Root node not found", ))) }; }; // It’s possible we fail to get the root node’s path, but we can’t continue then. Advise // to fall back on error. let shared_dir_path = root_node.get_path(&self.fs.proc_self_fd).map_err(|err| { WrappedError::Fallback( io::Error::from(err).context("Failed to get shared directory's path"), ) })?; for inode_data in self.fs.inodes.iter() { if self .cancel .as_ref() .map(|c| c.load(Ordering::Relaxed)) .unwrap_or(false) { break; } if !self.should_update_inode(&inode_data) { continue; } let set_path_result = set_path_migration_info_from_proc_self_fd(&inode_data, self.fs, &shared_dir_path); match self.mode { // For preserialization, finding inodes is not worth a notification. 
For errors, // we distinguish between errors for which we advise our caller to fall back to a // different preserialization methods, and once where we do not. The latter we // just log, the former we return to the caller immediately. They must then fall // back to an exhaustive method, so aborting early is OK. Mode::Constructor => match set_path_result { Ok(()) => (), Err(WrappedError::Fallback(err)) => return Err(WrappedError::Fallback(err)), Err(WrappedError::Unrecoverable(err)) => { error!("Inode {}: {}", inode_data.inode, err) } }, // In check modes, we note inodes we found, and log all kinds of errors // indiscriminately. Mode::ConfirmPaths | Mode::ImplicitPathCheck => { if let Err(err) = set_path_result { error!("Inode {}: {}", inode_data.inode, err.into_inner()); } else if let Some(new_info) = inode_data.migration_info.lock().unwrap().as_ref() { info!("Found inode {}: {}", inode_data.inode, new_info.location); } } } } Ok(()) } /** * Check the given inode’s migration info. * * - Return `true` iff the info should be updated from /proc/self/fd. * - Return `false` iff the info seems fine, and should be left as-is. */ fn should_update_inode(&self, inode_data: &InodeData) -> bool { let mut migration_info_locked = inode_data.migration_info.lock().unwrap(); match (&self.mode, migration_info_locked.as_ref()) { // Do not touch inodes: // - Without migration info in the implicit/lax check mode // - When we are supposed to collect migration info, not check/update it, i.e. 
during // preserialization, and the inode already has migration info (Mode::ImplicitPathCheck, None) | (Mode::Constructor, Some(_)) => false, // In both the explicit check mode and the preserialization constructor, give migration // info to those inodes that don’t already have it (Mode::ConfirmPaths, None) | (Mode::Constructor, None) => true, // In both check modes, when there is pre-existing migration info, we have to check its // path; update those we find to be incorrect (Mode::ConfirmPaths, Some(migration_info)) | (Mode::ImplicitPathCheck, Some(migration_info)) => { if let Err(err) = migration_info.check_path_presence(inode_data) { // Migration info is wrong, clear it unconditionally, regardless of whether we // can find a better one let migration_info = migration_info_locked.take().unwrap(); warn!( "Lost inode {} (former location: {}): {}; looking it up through /proc/self/fd", inode_data.inode, migration_info.location, err ); true } else { false } } } } } /// Return an inode’s link count, if available. fn link_count(inode_data: &InodeData) -> Option { inode_data .get_file() .ok() .and_then(|f| statx(&f, None).ok()) .map(|stat| stat.st.st_nlink) } /** * Update inode migration info from /proc/self/fd. * * Fetch the given inode’s path from /proc/self/fd, split that path into components relative to * the shared directory root, and for all inodes along that path, if they don’t have a migration * info set, set it accordingly. * * Note that this is decidedly not a method of `Walker` so that we can easily reuse it in other * places; specifically, to re-establish a path for inodes that have been potentially invalidated. 
*/ pub(in crate::passthrough) fn set_path_migration_info_from_proc_self_fd( inode_data: &InodeData, fs: &PassthroughFs, shared_dir_path: &CStr, ) -> Result<(), WrappedError> { let abs_path_result = inode_data.get_path(&fs.proc_self_fd); let Ok(abs_path) = abs_path_result else { let err = abs_path_result.unwrap_err(); // In case of `Mode::Constructor`, depending on the exact kind of error, figure out whether // it makes sense to fall back to a different method of finding inodes’ paths let fall_back = match &err { // If the kernel reports this inode to be deleted even though it has a link somewhere, // fall back and try to find that link’s path InodePathError::FdPathError(FdPathError::Deleted(_)) => { link_count(inode_data).map(|n| n > 0).unwrap_or(false) } // If the kernel reports this inode under a path outside of the shared directory but it // has multiple links, one of those might be inside of the shared directory, so fall // back and try to find it InodePathError::OutsideRoot => link_count(inode_data).map(|n| n > 1).unwrap_or(false), // Very general problem, should not happen, so consider this unrecoverable InodePathError::NoFd(_) => false, // Consider all other internal errors from getting the path from /proc/self/fd to be // problems pertaining specifically to this method of obtaining paths, i.e. mark them // as `Fallback` errors InodePathError::FdPathError(_) => true, }; let err = io::Error::from(err).context("Failed to get path from /proc/self/fd"); return if fall_back { Err(WrappedError::Fallback(err)) } else { Err(WrappedError::Unrecoverable(err)) }; }; let rel_path = relative_path(&abs_path, shared_dir_path) .map_err(|err| { // Same as `OutsideRoot` above if link_count(inode_data).map(|n| n > 1).unwrap_or(false) { WrappedError::Fallback(err) } else { WrappedError::Unrecoverable(err) } })? 
.to_str() .map_err(|err| { // Non UTF-8 path names are unrecoverable WrappedError::Unrecoverable(other_io_error(format!( "Path {abs_path:?} is not a UTF-8 string: {err}" ))) })? .to_string(); let path = Path::new(&rel_path); // Getting the root node should always succeed; if it doesn’t, everything is broken anyway and // falling back will not fix it. let mut parent = fs .inodes .get_strong(fuse::ROOT_ID) .map_err(WrappedError::Unrecoverable)?; for element in path { // Both `unwrap()`s must succeed: We know the path is UTF-8, and we know it does not // contain internal NULs (because it used to be a CString before) let element_cstr = CString::new(element.to_str().unwrap()).unwrap(); // This look-up automatically sets the inode migration data on this inode. // If we fail the look-up (i.e. fail to traverse the path), other migration methods are // unlikely to succeed either, so consider errors here unrecoverable. let entry = fs .do_lookup(parent.get().inode, &element_cstr) .map_err(WrappedError::Unrecoverable)?; // `entry.inode` is effectively a strong reference, so this must succeed let entry_data = fs.inodes.get(entry.inode).unwrap(); // Safe: Turns `entry.inode` back into a typed strong reference let entry_inode = unsafe { StrongInodeReference::new_no_increment(entry_data, &fs.inodes) }; { let entry_data = entry_inode.get(); let mut mig_info = entry_data.migration_info.lock().unwrap(); if mig_info.is_none() { // If we fail to set the migration info while traversing the path, other // preserialization methods will likely encounter the same problem. Unrecoverable // error. *mig_info = Some( InodeMigrationInfo::new( &fs.cfg, parent, &element_cstr, &entry_data.file_or_handle, ) .map_err(WrappedError::Unrecoverable)?, ); } } parent = entry_inode; } if parent.get().inode != inode_data.inode { // For some reason, we failed to end up on the inode where we wanted to end up. Maybe // another preserialization method would have more luck? Advise to fall back. 
return Err(WrappedError::Fallback(other_io_error(format!( "Inode not found under path reported by /proc/self/fd ({rel_path:?})" )))); } Ok(()) } impl WrappedError { /// Return the contained `io::Error`, discarding the fall-back advice. pub fn into_inner(self) -> io::Error { match self { WrappedError::Fallback(err) => err, WrappedError::Unrecoverable(err) => err, } } } virtiofsd-1.13.0/src/passthrough/device_state/serialization.rs000064400000000000000000000322501046102023000227410ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. /*! * Serialization functionality (i.e. what happens in `SerializableFileSystem::serialize()`): Take * information that we have collected during preserialization and turn it into actually * serializable structs ('serialized' module), which are then turned into a plain vector of bytes. */ use crate::fuse; use crate::passthrough::device_state::preserialization::{ self, HandleMigrationInfo, InodeMigrationInfo, }; use crate::passthrough::device_state::serialized; use crate::passthrough::inode_store::InodeData; use crate::passthrough::mount_fd::MountFds; use crate::passthrough::util::relative_path; use crate::passthrough::{Handle, HandleData, MigrationMode, PassthroughFs}; use crate::util::{other_io_error, ResultErrorContext}; use std::collections::{HashMap, HashSet}; use std::convert::TryFrom; use std::ffi::CString; use std::io; use std::sync::atomic::Ordering; /** * Helper structure to generate the mount FD map. * * The mount FD map maps the source’s mount IDs to paths in the shared directory, which the * destination instance can open as mount FDs to make use of file handles created on the source. 
*/ struct MountPathsBuilder<'a> { /// Reference to [`PassthroughFs.mount_fds`](`PassthroughFs#structfield.mount_fds`) mount_fds: &'a MountFds, /// Path of the shared directory shared_dir_path: CString, } impl TryFrom for Vec { type Error = io::Error; /// Root of serialization: Turn the final `serialized::PassthroughFs` struct into plain bytes fn try_from(state: serialized::PassthroughFs) -> io::Result { postcard::to_stdvec(&state).map_err(other_io_error) } } impl From<&PassthroughFs> for serialized::PassthroughFsV2 { /// Serialize `fs`, assuming it has been prepared for serialization (i.e. all inodes must have /// their migration info set) fn from(fs: &PassthroughFs) -> Self { let handles_map = fs.handles.read().unwrap(); let inodes: Vec = fs.inodes.iter().map(|inode| { inode .as_ref() .as_serialized(fs) .unwrap_or_else(|err| { warn!( "Failed to serialize inode {} (st_dev={}, mnt_id={}, st_ino={}): {}; marking as invalid", inode.inode, inode.ids.dev, inode.ids.mnt_id, inode.ids.ino, err ); serialized::Inode { id: inode.inode, refcount: inode.refcount.load(Ordering::Relaxed), location: serialized::InodeLocation::Invalid, file_handle: None, } }) }).collect(); let mount_paths = if fs.cfg.migration_mode == MigrationMode::FileHandles { match MountPathsBuilder::new(fs) { Ok(mpb) => mpb.build(inodes.iter()), Err(err) => { warn!( "Cannot collect mount points: {err}; will not be able to migrate any inodes" ); HashMap::new() } } } else { // No need for mount paths outside of file-handles migration mode HashMap::new() }; let handles = handles_map .iter() .map(|(handle, data)| (*handle, data.as_ref()).into()) .collect(); serialized::PassthroughFsV2 { v1: serialized::PassthroughFsV1 { inodes, next_inode: fs.next_inode.load(Ordering::Relaxed), handles, next_handle: fs.next_handle.load(Ordering::Relaxed), negotiated_opts: fs.into(), }, mount_paths, } } } impl From<&PassthroughFs> for serialized::NegotiatedOpts { /// Serialize the options we have negotiated with the guest fn 
from(fs: &PassthroughFs) -> Self { serialized::NegotiatedOpts { writeback: fs.writeback.load(Ordering::Relaxed), announce_submounts: fs.announce_submounts.load(Ordering::Relaxed), posix_acl: fs.posix_acl.load(Ordering::Relaxed), sup_group_extension: fs.sup_group_extension.load(Ordering::Relaxed), } } } impl InodeData { /// Serialize an inode, which requires that its `migration_info` is set fn as_serialized(&self, fs: &PassthroughFs) -> io::Result { let id = self.inode; let refcount = self.refcount.load(Ordering::Relaxed); // Note that we do not special-case invalid inodes here (`self.file_or_handle == // FileOrHandle::Invalid(_)`), i.e. inodes that this instance failed to find on a prior // incoming migration. We do not expect them to have migration info (we could not open // them, so we should not know where to find them), but if we do, there must be a reason // for it, so we might as well forward it to our destination. let migration_info_locked = self.migration_info.lock().unwrap(); let migration_info = migration_info_locked .as_ref() .ok_or_else(|| other_io_error("Failed to reconstruct inode location"))?; // The root node (and only the root node) must have its special kind of placeholder info assert_eq!( (id == fuse::ROOT_ID), matches!( migration_info.location, preserialization::InodeLocation::RootNode ) ); // Serialize the information that tells the destination how to find this inode let location = migration_info.as_serialized()?; let file_handle = if fs.cfg.migration_verify_handles { // We could construct the file handle now, but we don't want to do I/O here. It should // have been prepared in the preserialization phase. If it is not, that's an internal // programming error. 
let handle = migration_info .file_handle .as_ref() .ok_or_else(|| other_io_error("No prepared file handle found"))?; Some(handle.clone()) } else { None }; Ok(serialized::Inode { id, refcount, location, file_handle, }) } } impl InodeMigrationInfo { /// Helper for serializing inodes: Turn their prepared `migration_info` into a /// `serialized::InodeLocation` fn as_serialized(&self) -> io::Result { Ok(match &self.location { preserialization::InodeLocation::RootNode => serialized::InodeLocation::RootNode, preserialization::InodeLocation::Path(preserialization::find_paths::InodePath { parent, filename, }) => { // Safe: We serialize everything before we will drop the serialized state (the // inode store), so the strong refcount in there will outlive this weak reference // (which means that the ID we get will remain valid until everything is // serialized, i.e. that parent node will be part of the serialized state) let parent = unsafe { parent.get_raw() }; let filename = filename.clone(); serialized::InodeLocation::Path { parent, filename } } preserialization::InodeLocation::FileHandle( preserialization::file_handles::FileHandle { handle }, ) => serialized::InodeLocation::FileHandle { handle: handle.clone(), }, }) } } impl From<(Handle, &HandleData)> for serialized::Handle { /// Serialize a handle fn from(handle: (Handle, &HandleData)) -> Self { // Note that we will happily process invalid handles here (`handle.1.file == // HandleDataFile::Invalid(_)`), i.e. handles that this instance failed to open on a prior // incoming migration. A handle is identified by the inode to which it belongs, and // instructions on how to open that inode (e.g. `open()` flags). If this instance failed // to open the inode in this way (on in-migration), that does not prevent us from // forwarding the same information to the next destination (on out-migration), and thus // allow it to re-try. 
let source = (&handle.1.migration_info).into(); serialized::Handle { id: handle.0, inode: handle.1.inode, source, } } } impl From<&HandleMigrationInfo> for serialized::HandleSource { /// Helper for serializing handles: Turn their prepared `migration_info` into a /// `serialized::HandleSource` fn from(repr: &HandleMigrationInfo) -> Self { match repr { HandleMigrationInfo::OpenInode { flags } => { serialized::HandleSource::OpenInode { flags: *flags } } } } } impl<'a> MountPathsBuilder<'a> { /** * Create a new `MountPathsBuilder` for `fs`. * * `fs` is needed to: * - get the shared directory’s (root node’s) path, * - get a reference to [`fs.mount_fds`](PassthroughFs#structfield.mount_fds), which is * basically the map we want to serialize (except it maps to FDs, and we want to map to * paths). */ fn new(fs: &'a PassthroughFs) -> io::Result { // No reason to use `MountPathsBuilder` in any other migration mode assert!(fs.cfg.migration_mode == MigrationMode::FileHandles); // With the migration mode is set to "file-handles", `PassthroughFs::new()` is expected to // create `mount_fds`, so it should be present let Some(mount_fds) = fs.mount_fds.as_ref() else { return Err(other_io_error("No mount FD map found")); }; let Some(root_node) = fs.inodes.get(fuse::ROOT_ID) else { if fs.inodes.is_empty() { // No inodes at all, the FS is probably not mounted. There will not be any // serialized inodes, so `build()` will have nothing to do, and we can just keep // `shared_dir_path` empty. return Ok(MountPathsBuilder { mount_fds, shared_dir_path: CString::new("").unwrap(), }); } else { return Err(other_io_error( "Root node (shared directory) not in inode store", )); } }; let shared_dir_path = root_node .get_path(&fs.proc_self_fd) .map_err(io::Error::from) .err_context(|| "Failed to get shared directory path")?; Ok(MountPathsBuilder { mount_fds, shared_dir_path: shared_dir_path.to_owned(), }) } /** * Internal use: Get `mnt_id`’s path in the shared directory. 
* * Return the path of an inode relative to the shared directory that is on the mount `mnt_id`. */ fn get_mount_path(&mut self, mnt_id: u64) -> io::Result { let path = self .mount_fds .get_mount_root(mnt_id) .map_err(other_io_error)?; // Clone `path` so we can still use it in the error message let c_path = CString::new(path.clone()) .map_err(|_| other_io_error(format!("Cannot convert path ({path}) to C string")))?; let c_relative_path = match relative_path(&c_path, &self.shared_dir_path) { Ok(rp) => rp, // Error means the path is outside of the shared directory. Return the shared // directory itself, then. Err(_) => return Ok(".".to_string()), }; let relative_path = c_relative_path.to_str().map_err(|_| { other_io_error(format!( "Path {c_relative_path:?} cannot be converted to UTF-8" )) })?; if relative_path.is_empty() { Ok(".".to_string()) } else { Ok(relative_path.to_string()) } } /** * Given an iterator over all serialized inodes, construct the map. * * Iterate over all serialized inodes, and create a mount path map that has an entry for every * mount ID referenced by any file handle in any of the serialized inodes. */ fn build<'b, I: Iterator>( mut self, iter: I, ) -> HashMap { let mount_ids: HashSet = iter .filter_map(|si| match &si.location { serialized::InodeLocation::FileHandle { handle } => Some(handle.mount_id()), _ => None, }) .collect(); let mut map = HashMap::new(); for mount_id in mount_ids { match self.get_mount_path(mount_id) { Ok(path) => { map.insert(mount_id, path); } Err(err) => warn!( "Failed to get mount ID {mount_id}'s root: {err}; \ will not be able to migrate inodes on this filesystem" ), } } map } } virtiofsd-1.13.0/src/passthrough/device_state/serialized.rs000064400000000000000000000131041046102023000222140ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. /*! 
* Structs and enums that constitute our serialized state "on the wire". Turning them into/from * plain bytes still needs to be done with some serde implementation. */ use crate::passthrough::file_handle::SerializableFileHandle; use crate::passthrough::inode_store::Inode as InodeId; use crate::passthrough::Handle as HandleId; use serde::{Deserialize, Serialize}; use std::collections::HashMap; /// Full serialized device state (for `PassthroughFs`). This is an enum so in case incompatible /// changes have to be made, new version variants can be added while still being able to migrate /// from older versions. #[derive(Debug, Deserialize, Serialize)] pub(super) enum PassthroughFs { /// Initial version V1(PassthroughFsV1), /// Version with mount point paths (for migrating file handles) V2(PassthroughFsV2), } /// v1 of our serialized migration stream. #[derive(Debug, Deserialize, Serialize)] pub(super) struct PassthroughFsV1 { /// List of all looked up inodes pub(super) inodes: Vec, /// Next free index for inode IDs pub(super) next_inode: u64, /// List of all open files (handles) pub(super) handles: Vec, /// Next free index for handle IDs pub(super) next_handle: u64, /// Remember which options have been negotiated during INIT pub(super) negotiated_opts: NegotiatedOpts, } /// v2 of our serialized migration stream. #[derive(Debug, Deserialize, Serialize)] pub(super) struct PassthroughFsV2 { /// Base class: All v1 fields pub(super) v1: PassthroughFsV1, /** * Map of mount IDs to paths in the shared directory. * * When migrating using file handles, we need to translate the source’s mount IDs (only valid * there) into paths inside the shared directory so the destination can generate mount FDs for * file handles created by the source, to pass them to `open_by_handle_at()`. */ pub(super) mount_paths: HashMap, } /// Options that can be negotiated during INIT, i.e. 
ones for which we must remember whether we /// have enabled them after negotiating with the guest #[derive(Debug, Deserialize, Serialize)] pub(super) struct NegotiatedOpts { pub(super) writeback: bool, pub(super) announce_submounts: bool, pub(super) posix_acl: bool, pub(super) sup_group_extension: bool, } /// Serializable data for an inode that has been looked up #[derive(Debug, Deserialize, Serialize)] pub(super) struct Inode { /// Own inode ID pub(super) id: InodeId, /// Current refcount pub(super) refcount: u64, /// Description of this inode that allows the destination to find it pub(super) location: InodeLocation, /// Inode file handle. If present, the destination is not supposed to open this file handle, /// but instead compare it against the one of the inode it has opened based on `location`. pub(super) file_handle: Option, } /// Serializable description of some inode that allows the destination to find it #[derive(Debug, Deserialize, Serialize)] pub(super) enum InodeLocation { /// The root node is not given a serialized location; the destination is supposed to find it on /// its own RootNode, /// Described by its path: The destination will have to open the given filename Path { /// ID of the parent inode parent: InodeId, /// A filename relative to the parent that allows opening this inode. Note that using /// `String` restricts us to paths that can be represented as UTF-8, which is not /// necessarily a restriction that all operating systems have. However, we need to use /// some common encoding (i.e., cannot use `OsString`), or otherwise we could not migrate /// between operating systems using different string representations. filename: String, }, /// Source has deemed that this inode can no longer be found. The destination needs to decide /// how to proceed (e.g. whether to abort migration or simply remember that this inode is /// invalid and tell the guest so). 
Invalid, /// Described by its path: The destination will have to open the given filename relative to the /// shared directory (the root node). In contrast to `Path`, there is no strong reference to /// the shared directory node. FullPath { /// Filename relative to the shared directory root node. Stored in UTF-8, just like /// `Path.filename`. filename: String, }, /// Described by its file handle FileHandle { /// File handle, which includes the source system’s mount ID (only valid as a key for the /// [`PassthroughFsV2.mount_paths`](PassthroughFsV2#structfield.mount_paths) map). handle: SerializableFileHandle, }, } /// Serializable representation of an open file (a handle) #[derive(Debug, Deserialize, Serialize)] pub(super) struct Handle { /// Own handle ID pub(super) id: HandleId, /// Inode to which this handle refers pub(super) inode: InodeId, /// Describes where this handle comes from, so the destination can open it pub(super) source: HandleSource, } /// Serializable description of some handle that allows the destination to open it #[derive(Debug, Deserialize, Serialize)] pub(super) enum HandleSource { /// Handle should be opened by opening `Handle.inode` with the `open(2)` flags given here OpenInode { /// Flags passed to `openat(2)` flags: i32, }, } virtiofsd-1.13.0/src/passthrough/file_handle.rs000064400000000000000000000215401046102023000176570ustar 00000000000000// Copyright 2021 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
use crate::oslib; use crate::passthrough::mount_fd::{MPRResult, MountFd, MountFds}; use crate::passthrough::stat::MountId; use serde::{Deserialize, Serialize}; use std::convert::{TryFrom, TryInto}; use std::ffi::CStr; use std::fmt::{self, Display}; use std::fs::File; use std::io; use std::os::unix::io::{AsRawFd, RawFd}; use std::sync::Arc; const EMPTY_CSTR: &[u8] = b"\0"; #[derive(Clone, PartialOrd, Ord, PartialEq, Eq)] pub struct FileHandle { mnt_id: MountId, handle: oslib::CFileHandle, } pub struct OpenableFileHandle { handle: FileHandle, mount_fd: Arc, } #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SerializableFileHandle { mnt_id: u64, handle_type: i32, handle: Vec, } pub enum FileOrHandle { File(File), Handle(OpenableFileHandle), // `io::Error` does not implement `Clone`, so without wrapping it in `Arc`, returning the error // anywhere would be impossible without consuming it Invalid(Arc), } impl FileHandle { /// Try to create a file handle for the given file. In contrast to `from_name_at()`, this will /// always return a file handle or an error. pub fn from_name_at_fail_hard(dir: &impl AsRawFd, path: &CStr) -> io::Result { let mut mount_id: libc::c_int = 0; let mut c_fh = oslib::CFileHandle::default(); oslib::name_to_handle_at(dir, path, &mut c_fh, &mut mount_id, libc::AT_EMPTY_PATH)?; Ok(FileHandle { mnt_id: mount_id as MountId, handle: c_fh, }) } /// Create a file handle for the given file. /// /// Return `Ok(None)` if no file handle can be generated for this file: Either because the /// filesystem does not support it, or because it would require a larger file handle than we /// can store. These are not intermittent failures, i.e. if this function returns `Ok(None)` /// for a specific file, it will always return `Ok(None)` for it. Conversely, if this function /// returns `Ok(Some)` at some point, it will never return `Ok(None)` later. /// /// Return an `io::Error` for all other errors. 
pub fn from_name_at(dir: &impl AsRawFd, path: &CStr) -> io::Result> { match Self::from_name_at_fail_hard(dir, path) { Ok(fh) => Ok(Some(fh)), Err(err) => match err.raw_os_error() { // Filesystem does not support file handles Some(libc::EOPNOTSUPP) => Ok(None), // Handle would need more bytes than `MAX_HANDLE_SZ` Some(libc::EOVERFLOW) => Ok(None), // Other error _ => Err(err), }, } } /// Try to create a file handle for `fd`. In contrast to `from_fd()`, this will always return /// a file handle or an error. pub fn from_fd_fail_hard(fd: &impl AsRawFd) -> io::Result { // Safe because this is a constant value and a valid C string. let empty_path = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; Self::from_name_at_fail_hard(fd, empty_path) } /// Create a file handle for `fd`. /// This is a wrapper around `from_name_at()` and so has the same interface. pub fn from_fd(fd: &impl AsRawFd) -> io::Result> { // Safe because this is a constant value and a valid C string. let empty_path = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; Self::from_name_at(fd, empty_path) } /** * Return an openable copy of the file handle by ensuring that `mount_fds` contains a valid fd * for the mount the file handle is for. * * `reopen_fd` will be invoked to duplicate an `O_PATH` fd with custom `libc::open()` flags. */ pub fn to_openable( &self, mount_fds: &MountFds, reopen_fd: F, ) -> MPRResult where F: FnOnce(RawFd, libc::c_int) -> io::Result, { Ok(OpenableFileHandle { handle: self.clone(), mount_fd: mount_fds.get(self.mnt_id, reopen_fd)?, }) } } impl OpenableFileHandle { pub fn inner(&self) -> &FileHandle { &self.handle } /** * Open a file handle, using our mount FDs hash map. */ pub fn open(&self, flags: libc::c_int) -> io::Result { oslib::open_by_handle_at(self.mount_fd.file(), &self.handle.handle, flags) } } impl SerializableFileHandle { /// Compare `self` against `other`, disregarding the mount ID. 
Return a more or less /// descriptive error if both handles are not equal. pub fn require_equal_without_mount_id(&self, other: &Self) -> Result<(), String> { if self.handle_type != other.handle_type { Err(format!( "File handle type differs: 0x{:x} != 0x{:x}", self.handle_type, other.handle_type )) } else if self.handle != other.handle { use std::fmt::Write; let mut description = "File handle differs:".to_string(); for b in self.handle.iter() { let _ = write!(&mut description, " {b:02x}"); } description += " !="; for b in other.handle.iter() { let _ = write!(&mut description, " {b:02x}"); } Err(description) } else { Ok(()) } } /// Compare `self` against `other`. Return a more or less descriptive error if both handles /// are not equal. pub fn require_equal(&self, other: &Self) -> Result<(), String> { if self.mnt_id != other.mnt_id { Err(format!( "File handle mount ID differs: {} != {}", self.mnt_id, other.mnt_id )) } else { self.require_equal_without_mount_id(other) } } /// Get the handle data (without its mount ID or type). pub fn as_bytes(&self) -> &[u8] { &self.handle } /// Get the handle type. pub fn handle_type(&self) -> i32 { self.handle_type } /// Get the mount ID for which this handle is valid. pub fn mount_id(&self) -> u64 { self.mnt_id } /** * Convert this handle into an openable handle. * * An openable handle must have a reference to a mount FD, i.e. a file descriptor on the mount * identified by its mount ID. That FD is `mount_fd`. * * (Note that `mount_fd.mount_id()` may differ from `self.mount_id()`. When migrating, we will * receive `SerializableFileHandle`s from the source with mount IDs valid only on the source, * not here. The caller is responsible for passing a fitting `mount_fd` for a mount ID valid * here.) 
*/ pub fn to_openable(&self, mount_fd: Arc) -> io::Result { let c_handle: oslib::CFileHandle = self.try_into()?; Ok(OpenableFileHandle { handle: FileHandle { // Use the mount FD’s mount ID instead of `self.mnt_id`: Serialized handles may // contain mount IDs that aren’t valid on this host. `MountFd` objects’ mount IDs // are always valid on the current host, and because the caller guarantees that // `mount_fd` can be used to open `self`, we can use its mount ID. mnt_id: mount_fd.mount_id(), handle: c_handle, }, mount_fd, }) } } impl From<&FileHandle> for SerializableFileHandle { fn from(fh: &FileHandle) -> SerializableFileHandle { SerializableFileHandle { mnt_id: fh.mnt_id, #[allow(clippy::useless_conversion)] handle_type: fh.handle.handle_type().try_into().unwrap(), handle: fh.handle.as_bytes().into(), } } } impl From for SerializableFileHandle { fn from(fh: FileHandle) -> SerializableFileHandle { (&fh).into() } } impl TryFrom<&FileOrHandle> for SerializableFileHandle { type Error = io::Error; fn try_from(file_or_handle: &FileOrHandle) -> io::Result { match file_or_handle { FileOrHandle::Handle(handle) => Ok(handle.inner().into()), FileOrHandle::File(file) => { FileHandle::from_fd_fail_hard(file).map(SerializableFileHandle::from) } FileOrHandle::Invalid(err) => Err(io::Error::new(err.kind(), Arc::clone(err))), } } } impl Display for SerializableFileHandle { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "mount_id={}, handle_type=0x{:x}, handle=", self.mnt_id, self.handle_type )?; for byte in &self.handle { write!(f, "{byte:02x}")?; } Ok(()) } } virtiofsd-1.13.0/src/passthrough/inode_store.rs000064400000000000000000000660771046102023000177550ustar 00000000000000// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. 
use crate::fuse; use crate::passthrough::device_state::preserialization::InodeMigrationInfo; use crate::passthrough::file_handle::{FileHandle, FileOrHandle}; use crate::passthrough::stat::MountId; use crate::passthrough::util::{ ebadf, get_path_by_fd, is_safe_inode, reopen_fd_through_proc, FdPathError, }; use crate::util::other_io_error; use std::collections::BTreeMap; use std::ffi::CString; use std::fs::File; use std::ops::Deref; use std::os::unix::io::{AsRawFd, RawFd}; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex, RwLock}; use std::{fmt, io}; pub type Inode = u64; #[derive(Clone, Copy, Default, Eq, Ord, PartialEq, PartialOrd)] pub(crate) struct InodeIds { pub ino: libc::ino64_t, pub dev: libc::dev_t, pub mnt_id: MountId, } /// Strong reference to some inode in our inode store, which is counted against the /// `InodeData.refcount` field. Dropping this object will thus decrement that refcount, and /// potentially remove the inode from the store (when the refcount reaches 0). /// Note that dropping this object locks its inode store, so care must be taken not to drop strong /// references while the inode store is locked, or to use `StrongInodeReference::drop_unlocked()`. pub(crate) struct StrongInodeReference { /// Referenced inode's data. /// Is only `None` after the inode has been leaked, which cannot occur outside of `leak()` and /// `drop()`, because `leak()` consumes the object. inode_data: Option>, /// Inode store that holds the referenced inode. inode_store: Arc>, } pub(crate) struct InodeData { pub inode: Inode, // Most of these aren't actually files but ¯\_(ツ)_/¯. pub file_or_handle: FileOrHandle, pub refcount: AtomicU64, // Used as key in the `InodeStoreInner::by_ids` map. pub ids: InodeIds, // File type and mode pub mode: u32, // Constructed in the `prepare_serialization` phase of migration, and must be set on all inodes // when we are actually going to serialize our internal state to send it to the migration // destination. 
// Because this may contain a strong inode reference, which must not be dropped while the inode // store is locked, this info must in turn not be dropped while the store is locked. // To ensure this, locking of the store is only done here in this file, and here we ensure that // while the store is locked, `InodeMigrationInfo` (e.g. as part of an `InodeData`) is dropped // only by using `drop_unlocked()` for a potentially contained strong reference. pub(super) migration_info: Mutex>, } /** * Represents the file associated with an inode (`InodeData`). * * When obtaining such a file, it may either be a new file (the `Owned` variant), in which case the * object's lifetime is static, or it may reference `InodeData.file` (the `Ref` variant), in which * case the object's lifetime is that of the respective `InodeData` object. */ pub(crate) enum InodeFile<'inode_lifetime> { Owned(File), Ref(&'inode_lifetime File), } #[derive(Default)] struct InodeStoreInner { data: BTreeMap>, by_ids: BTreeMap, by_handle: BTreeMap, } #[derive(Default)] pub(crate) struct InodeStore { inner: Arc>, } /** * Iterates over the inode store. * * Does not keep the store locked between `next()` calls, and will return inodes added while * iterating. */ pub(crate) struct InodeIterator<'a> { /// Inode store. store: &'a InodeStore, /** * Last inode ID returned through `next()`. * * We visit inodes in numerical order of their ID, and because new IDs added to the store are * always greater than all previous IDs, all remaining IDs to visit must be greater than this * one. */ last_inode: Option, } /** * Errors that `InodeData::get_path()` can encounter. * * This specialized error type exists so that * [`crate::passthrough::device_state::preserialization::proc_paths`] can decide which errors it * considers recoverable. */ #[derive(Debug)] pub(crate) enum InodePathError { /// Failed to get an FD for this inode. NoFd(io::Error), /// `util::get_path_by_fd()` failed. 
FdPathError(FdPathError), /// Path reported by `util::get_path_by_fd()` is outside of the shared directory. OutsideRoot, } impl<'a> InodeData { /// Get an `O_PATH` file for this inode pub fn get_file(&'a self) -> io::Result> { match &self.file_or_handle { FileOrHandle::File(f) => Ok(InodeFile::Ref(f)), FileOrHandle::Handle(h) => { let file = h.open(libc::O_PATH)?; Ok(InodeFile::Owned(file)) } FileOrHandle::Invalid(err) => Err(io::Error::new( err.kind(), format!("Inode is invalid because of an error during the preceding migration, which was: {err}"), )), } } /// Try to obtain this inode's path through /proc/self/fd pub fn get_path(&self, proc_self_fd: &File) -> Result { let fd = self.get_file().map_err(InodePathError::NoFd)?; let path = get_path_by_fd(&fd, proc_self_fd).map_err(InodePathError::FdPathError)?; // Kernel will report nodes beyond our root as having path / -- but only the root node (the // shared directory) can actually have that path, so for others, it must be inaccurate if path.as_bytes() == b"/" && self.inode != fuse::ROOT_ID { return Err(InodePathError::OutsideRoot); } Ok(path) } /// Open this inode with the given flags /// (always returns a new (i.e. 
`Owned`) file, hence the static lifetime) pub fn open_file( &self, flags: libc::c_int, proc_self_fd: &File, ) -> io::Result> { // Do not move the `is_safe_inode()` check up: It is always false for invalid inodes, so // would hide their perfectly good error message match &self.file_or_handle { FileOrHandle::File(f) => { if !is_safe_inode(self.mode) { return Err(ebadf()); } let new_file = reopen_fd_through_proc(f, flags, proc_self_fd)?; Ok(InodeFile::Owned(new_file)) } FileOrHandle::Handle(h) => { if !is_safe_inode(self.mode) { return Err(ebadf()); } let new_file = h.open(flags)?; Ok(InodeFile::Owned(new_file)) } FileOrHandle::Invalid(err) => Err(io::Error::new( err.kind(), format!("Inode is invalid because of an error during the preceding migration, which was: {err}"), )), } } /// Return some human-readable identification of this inode, ideally the path. Will perform /// I/O, so is not extremely cheap to call. pub fn identify(&self, proc_self_fd: &File) -> String { if let Ok(path) = self.get_path(proc_self_fd) { path.to_string_lossy().to_string() } else { let mode = match self.mode & libc::S_IFMT { libc::S_IFREG => "file", libc::S_IFDIR => "directory", libc::S_IFLNK => "symbolic link", libc::S_IFIFO => "FIFO", libc::S_IFSOCK => "socket", libc::S_IFCHR => "character device", libc::S_IFBLK => "block device", _ => "unknown inode type", }; format!( "[{}; mount_id={} device_id={} inode_id={}]", mode, self.ids.mnt_id, self.ids.dev, self.ids.ino, ) } } } impl InodeFile<'_> { /// Create a standalone `File` object pub fn into_file(self) -> io::Result { match self { Self::Owned(file) => Ok(file), Self::Ref(file_ref) => file_ref.try_clone(), } } } impl AsRawFd for InodeFile<'_> { /// Return a file descriptor for this file /// Note: This fd is only valid as long as the `InodeFile` exists. 
fn as_raw_fd(&self) -> RawFd { match self { Self::Owned(file) => file.as_raw_fd(), Self::Ref(file_ref) => file_ref.as_raw_fd(), } } } impl InodeStoreInner { /// Insert a new entry into the inode store. Panics if the entry already existed. /// (This guarantees that inserting a value will not drop an existing `InodeMigrationInfo` /// object.) fn insert_new(&mut self, data: Arc) { // Overwriting something in `by_ids` or `by_handle` is not exactly what we want, but having // the same physical inode under several different FUSE IDs is not catastrophic, so do not // panic about that. self.by_ids.insert(data.ids, data.inode); if let FileOrHandle::Handle(handle) = &data.file_or_handle { self.by_handle.insert(handle.inner().clone(), data.inode); } let existing = self.data.insert(data.inode, data); assert!(existing.is_none()); } /// Remove the given inode, and, if found, take care to drop any associated strong reference in /// the migration info via `drop_unlocked()`. fn remove(&mut self, inode: Inode) { let data = self.data.remove(&inode); if let Some(data) = data { if let FileOrHandle::Handle(handle) = &data.file_or_handle { self.by_handle.remove(handle.inner()); } self.by_ids.remove(&data.ids); if let Some(mig_info) = data.migration_info.lock().unwrap().take() { mig_info.for_each_strong_reference(|strong_ref| strong_ref.drop_unlocked(self)); } } } fn clear(&mut self) { self.clear_migration_info(); self.data.clear(); self.by_handle.clear(); self.by_ids.clear(); } /// Clears all migration info, using `drop_unlocked()` to drop any strong references within. 
fn clear_migration_info(&mut self) { let mut strong_references = Vec::::new(); for inode in self.data.values() { if inode.inode == fuse::ROOT_ID { // Ignore root inode, we always want to keep its migration info around continue; } if let Some(mig_info) = inode.migration_info.lock().unwrap().take() { mig_info.for_each_strong_reference(|strong_ref| strong_references.push(strong_ref)); } } for strong_reference in strong_references { strong_reference.drop_unlocked(self); } } fn get(&self, inode: Inode) -> Option<&Arc> { self.data.get(&inode) } fn get_by_ids(&self, ids: &InodeIds) -> Option<&Arc> { self.inode_by_ids(ids).map(|inode| self.get(inode).unwrap()) } fn get_by_handle(&self, handle: &FileHandle) -> Option<&Arc> { self.inode_by_handle(handle) .map(|inode| self.get(inode).unwrap()) } fn contains(&self, inode: Inode) -> bool { self.data.contains_key(&inode) } fn inode_by_ids(&self, ids: &InodeIds) -> Option { self.by_ids.get(ids).copied() } fn inode_by_handle(&self, handle: &FileHandle) -> Option { self.by_handle.get(handle).copied() } fn is_empty(&self) -> bool { self.data.is_empty() } /// Decrement the refcount of the given `inode` ID, and remove it from the store when it /// reaches 0 fn forget_one(&mut self, inode: Inode, count: u64) { if let Some(data) = self.get(inode) { // Having a mutable reference on `self` prevents concurrent lookups from incrementing // the refcount but there is the possibility that a previous lookup already acquired a // reference to the inode data and is in the process of updating the refcount so we // need to loop here until we can decrement successfully. loop { let refcount = data.refcount.load(Ordering::Relaxed); // Saturating sub because it doesn't make sense for a refcount to go below zero and // we don't want misbehaving clients to cause integer overflow. let new_count = refcount.saturating_sub(count); // We don't need any stronger ordering, because the refcount itself doesn't protect // any data. 
if data.refcount.compare_exchange( refcount, new_count, Ordering::Relaxed, Ordering::Relaxed, ) == Ok(refcount) { if new_count == 0 { // We just removed the last refcount for this inode. There's no need for an // acquire fence here because we have a mutable reference on `self`. So // there's is no other release store for us to synchronize with before // deleting the entry. self.remove(inode); } break; } } } } } impl InodeStore { pub fn get(&self, inode: Inode) -> Option> { self.inner.read().unwrap().get(inode).cloned() } /** * Iterate over every inode that we have in the store. * * Does not keep the store locked between `next()` calls, and will return inodes added while * iterating. */ pub fn iter(&self) -> InodeIterator<'_> { InodeIterator { store: self, last_inode: None, } } /// Turn the weak reference `inode` into a strong one (increments its refcount) pub fn get_strong(&self, inode: Inode) -> io::Result { StrongInodeReference::new(inode, self) } /// Attempt to get an inode from `inodes` and create a strong reference to it, i.e. increment /// its refcount. Return that reference on success, and an error on failure. /// Reasons for failure can be that the inode isn't in the map or that the refcount is zero. /// This function will never increment a refcount that's already zero. /// Note that dropping the returned strong reference will automatically decrement the refcount /// again. pub fn claim_inode( &self, handle: Option<&FileHandle>, ids: &InodeIds, ) -> io::Result { self.do_claim_inode(&self.inner.read().unwrap(), handle, ids) } fn do_claim_inode>( &self, inner: &I, handle: Option<&FileHandle>, ids: &InodeIds, ) -> io::Result { let data = handle .and_then(|h| inner.get_by_handle(h)) .or_else(|| { inner.get_by_ids(ids).filter(|data| { // When we have to fall back to looking up an inode by its inode ID, ensure // that we hit an entry that has a valid file descriptor. 
Having an FD open // means that the inode cannot really be deleted until the FD is closed, so // that the inode ID remains valid until we evict the `InodeData`. With no FD // open (and just a file handle), the inode can be deleted while we still have // our `InodeData`, and so the inode ID may be reused by a completely different // new inode. Such inodes must be looked up by file handle, because this // handle contains a generation ID to differentiate between the old and the new // inode. matches!(data.file_or_handle, FileOrHandle::File(_)) }) }) .ok_or_else(|| { io::Error::new( io::ErrorKind::NotFound, "Cannot take strong reference to inode by handle or IDs, not found".to_string(), ) })?; StrongInodeReference::new_with_data(Arc::clone(data), self) } /// Check whether a matching inode is already present (see `claim_inode`), and if so, return /// that inode and drop `inode_data`. /// Otherwise, insert `inode_data`, and return a strong reference to it. `inode_data.refcount` /// is ignored; the returned strong reference is the only one that can exist, so the refcount /// is hard-set to 1. 
pub fn get_or_insert(&self, mut inode_data: InodeData) -> io::Result { let mut inner = self.inner.write().unwrap(); let handle = match &inode_data.file_or_handle { FileOrHandle::File(_) => None, FileOrHandle::Handle(handle) => Some(handle.inner()), FileOrHandle::Invalid(_) => None, }; if let Ok(inode) = self.do_claim_inode(&inner, handle, &inode_data.ids) { // `InodeData`s should not be dropped while the inode store is locked, so drop the lock // before `inode_data` drop(inner); return Ok(inode); } if inner.contains(inode_data.inode) { // `InodeData`s should not be dropped while the inode store is locked, so drop the lock // before `inode_data` drop(inner); return Err(other_io_error(format!( "Double-use of FUSE inode ID {}", inode_data.inode ))); } // Safe because we have the only reference inode_data.refcount = AtomicU64::new(1); let inode_data = Arc::new(inode_data); inner.insert_new(Arc::clone(&inode_data)); // We just set the reference to 1 to account for this Ok(unsafe { StrongInodeReference::new_no_increment(inode_data, self) }) } /// Insert `inode_data` into the inode store regardless of whether a matching inode already /// exists. 
However, if the given inode ID already exists, return an error and drop /// `inode_data.` pub fn new_inode(&self, inode_data: InodeData) -> io::Result<()> { let mut inner = self.inner.write().unwrap(); if inner.contains(inode_data.inode) { // `InodeData`s should not be dropped while the inode store is locked, so drop the lock // before `inode_data` drop(inner); return Err(other_io_error(format!( "Double-use of FUSE inode ID {}", inode_data.inode ))); } inner.insert_new(Arc::new(inode_data)); Ok(()) } pub fn forget_one(&self, inode: Inode, count: u64) { self.inner.write().unwrap().forget_one(inode, count); } pub fn forget_many>(&self, inodes: I) { let mut inner = self.inner.write().unwrap(); for (inode, count) in inodes { inner.forget_one(inode, count); } } pub fn clear(&self) { self.inner.write().unwrap().clear(); } pub fn clear_migration_info(&self) { self.inner.write().unwrap().clear_migration_info(); } pub fn is_empty(&self) -> bool { self.inner.read().unwrap().is_empty() } } impl StrongInodeReference { /// Create a new strong reference to the given inode in the given inode store, incrementing the /// refcount appropriately. pub fn new(inode: Inode, inode_store: &InodeStore) -> io::Result { let inode_data = inode_store.get(inode).ok_or_else(|| { io::Error::new( io::ErrorKind::NotFound, format!("Cannot take strong reference to inode {inode}: Not found"), ) })?; Self::new_with_data(inode_data, inode_store) } /// Create a new strong reference to an inode with the given data from the given inode store, /// incrementing the refcount appropriately. 
pub fn new_with_data(inode_data: Arc, inode_store: &InodeStore) -> io::Result { Self::increment_refcount_for(&inode_data)?; // Safe because we have just incremented the refcount Ok(unsafe { StrongInodeReference::new_no_increment(inode_data, inode_store) }) } /// Create a new strong reference to an inode with the given data from the given inode store, /// but do not increment the inode's refcount, and instead assume that the caller has already /// done it. /// /// # Safety /// Caller ensures the inode's refcount is incremented by 1 to account for this strong /// reference. pub unsafe fn new_no_increment(inode_data: Arc, inode_store: &InodeStore) -> Self { StrongInodeReference { inode_data: Some(inode_data), inode_store: Arc::clone(&inode_store.inner), } } /// Tries to increment the refcount in the given `inode_data`, but will refuse to increment a /// refcount that is 0 (because in this case, the inode is already in the process of being /// removed from the store, so continuing to use it would not be safe). fn increment_refcount_for(inode_data: &InodeData) -> io::Result<()> { // Use `.fetch_update()` instead of `.fetch_add()` to ensure we never increment the // refcount from zero to one. match inode_data .refcount .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |rc| { (rc > 0).then_some(rc + 1) }) { Ok(_old_rc) => Ok(()), Err(_old_rc) => Err(io::Error::new( io::ErrorKind::NotFound, format!( "Cannot take strong reference to inode {}: Is already deleted", inode_data.inode ), )), } } /// Consume this strong reference, yield the underlying inode ID, without decrementing the /// inode's refcount. /// /// # Safety /// Caller must guarantee that the refcount is tracked somehow still, i.e. that forget_one() /// will eventually be called. Otherwise, this inode will be truly leaked, which generally is /// not good. 
pub unsafe fn leak(mut self) -> Inode { // Unwrapping is safe: Every initializer sets this to `Some(_)`, and every function that // `take()`s the value (`leak()`, `drop_unlocked()`, `drop()`) also consumes `self`, so // outside of them, this must always be `None`. self.inode_data.take().unwrap().inode } /// Yield the underlying inode ID. /// /// # Safety /// The inode ID is technically a form of a weak reference. To ensure safety, the caller may /// not assume that it is valid beyond the lifetime of the corresponding strong reference. pub unsafe fn get_raw(&self) -> Inode { // Unwrapping is safe: Every initializer sets this to `Some(_)`, and every function that // `take()`s the value (`leak()`, `drop_unlocked()`, `drop()`) also consumes `self`, so // outside of them, this must always be `None`. self.inode_data.as_ref().unwrap().inode } /// Get the associated inode data. pub fn get(&self) -> &InodeData { // Unwrapping is safe: Every initializer sets this to `Some(_)`, and every function that // `take()`s the value (`leak()`, `drop_unlocked()`, `drop()`) also consumes `self`, so // outside of them, this must always be `None`. self.inode_data.as_ref().unwrap() } /// This function allows dropping a `StrongInodeReference` while the inode store is locked, but /// the caller must have mutable access to the inode store. fn drop_unlocked(mut self, inodes: &mut InodeStoreInner) { if let Some(inode_data) = self.inode_data.take() { inodes.forget_one(inode_data.inode, 1); } } } impl Clone for StrongInodeReference { /// Create an additional strong reference. fn clone(&self) -> Self { // Unwrapping is safe: Every initializer sets this to `Some(_)`, and every function that // `take()`s the value (`leak()`, `drop_unlocked()`, `drop()`) also consumes `self`, so // outside of them, this must always be `None`. 
let cloned_data = Arc::clone(self.inode_data.as_ref().unwrap()); let cloned_store = Arc::clone(&self.inode_store); // Unwrapping is safe, because this can only fail if the refcount became 0, which is // impossible because `self` is a strong reference Self::increment_refcount_for(&cloned_data).unwrap(); StrongInodeReference { inode_data: Some(cloned_data), inode_store: cloned_store, } } } impl Drop for StrongInodeReference { /// Decrement the refcount on the referenced inode, removing it from the store when the /// refcount reaches 0. /// Note that this function locks `self.inode_store`, so a `StrongInodeReference` must not be /// dropped while that inode store is locked. In such a case, /// `StrongInodeReference::drop_unlocked()` must be used. fn drop(&mut self) { if let Some(inode_data) = self.inode_data.take() { self.inode_store .write() .unwrap() .forget_one(inode_data.inode, 1); } } } impl Drop for InodeStore { /// Explicitly clear the inner inode store on drop, because there may be circular references /// within (in the migration info's strong references) that may otherwise prevent the /// `InodeStoreInner` from being dropped. fn drop(&mut self) { self.inner.write().unwrap().clear(); } } impl Iterator for InodeIterator<'_> { type Item = Arc; fn next(&mut self) -> Option> { let store = self.store.inner.read().unwrap(); // Find the inode with the lowest ID after `last_inode`. // Note that iterators over `BTreeMap` return keys in numerical order, so // `range(x..).next()` will always return the inode with the lowest ID greater than or // equal to `x` (if any). 
let lower_bound = self.last_inode.map(|last_id| last_id + 1).unwrap_or(0); let (inode_id, inode_data) = store.data.range(lower_bound..).next()?; self.last_inode = Some(*inode_id); Some(Arc::clone(inode_data)) } } impl From for io::Error { fn from(err: InodePathError) -> Self { match err { InodePathError::NoFd(err) => err, InodePathError::FdPathError(err) => err.into(), InodePathError::OutsideRoot => other_io_error( "Got empty path for non-root node, so it is outside the shared directory", ), } } } impl fmt::Display for InodePathError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { InodePathError::NoFd(err) => write!(f, "{err}"), InodePathError::FdPathError(err) => write!(f, "{err}"), InodePathError::OutsideRoot => write!( f, "Got empty path for non-root node, so it is outside the shared directory", ), } } } impl std::error::Error for InodePathError {} virtiofsd-1.13.0/src/passthrough/mod.rs000064400000000000000000003044311046102023000162070ustar 00000000000000// Copyright 2019 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
pub mod credentials;
pub mod device_state;
pub mod file_handle;
pub mod inode_store;
pub mod mount_fd;
pub mod read_only;
pub mod stat;
pub mod util;
pub mod xattrmap;

use crate::filesystem::{
    Context, Entry, Extensions, FileSystem, FsOptions, GetxattrReply, ListxattrReply, OpenOptions,
    SecContext, SetattrValid, SetxattrFlags, ZeroCopyReader, ZeroCopyWriter,
};
use crate::passthrough::credentials::{drop_effective_cap, UnixCredentials, UnixCredentialsGuard};
use crate::passthrough::device_state::preserialization::{
    self, HandleMigrationInfo, InodeMigrationInfo,
};
use crate::passthrough::inode_store::{
    Inode, InodeData, InodeFile, InodeIds, InodeStore, StrongInodeReference,
};
use crate::passthrough::util::{
    ebadf, is_safe_inode, openat, openat_verbose, reopen_fd_through_proc,
};
use crate::read_dir::ReadDir;
use crate::soft_idmap::{self, GuestGid, GuestUid, HostGid, HostUid, Id, IdMap};
use crate::util::{other_io_error, ResultErrorContext};
use crate::{fuse, oslib};
use file_handle::{FileHandle, FileOrHandle, OpenableFileHandle};
use mount_fd::{MPRError, MountFds};
use stat::{statx, StatExt};
use std::borrow::Cow;
use std::collections::{btree_map, BTreeMap};
use std::convert::TryInto;
use std::ffi::{CStr, CString};
use std::fs::File;
use std::io;
use std::io::ErrorKind;
use std::mem::MaybeUninit;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex, RwLock};
use std::time::Duration;
use xattrmap::{AppliedRule, XattrMap};

const EMPTY_CSTR: &[u8] = b"\0";

type Handle = u64;

/// The backing file of an open FUSE handle, or a stashed error for handles that could not be
/// restored (e.g. after migration).
enum HandleDataFile {
    File(RwLock<File>),
    // `io::Error` does not implement `Clone`, so without wrapping it in `Arc`, returning the error
    // anywhere would be impossible without consuming it
    Invalid(Arc<io::Error>),
}

struct HandleData {
    inode: Inode,
    file: HandleDataFile,

    // On migration, must be set when we serialize our internal state to send it to the
    // destination.
As long as `HandleMigrationInfo::new()` is cheap, we may as well // keep it always set. migration_info: HandleMigrationInfo, } struct ScopedWorkingDirectory { back_to: RawFd, } impl ScopedWorkingDirectory { fn new(new_wd: RawFd, old_wd: RawFd) -> ScopedWorkingDirectory { oslib::fchdir(new_wd).expect("the working directory should be changed"); ScopedWorkingDirectory { back_to: old_wd } } } impl Drop for ScopedWorkingDirectory { fn drop(&mut self) { oslib::fchdir(self.back_to).expect("the working directory should be changed"); } } fn set_working_directory(new_wd: RawFd, old_wd: RawFd) -> ScopedWorkingDirectory { ScopedWorkingDirectory::new(new_wd, old_wd) } /// The caching policy that the file system should report to the FUSE client. By default the FUSE /// protocol uses close-to-open consistency. This means that any cached contents of the file are /// invalidated the next time that file is opened. #[derive(Default, Debug, Clone)] pub enum CachePolicy { /// The client should never cache file data and all I/O should be directly forwarded to the /// server. This policy must be selected when file contents may change without the knowledge of /// the FUSE client (i.e., the file system does not have exclusive access to the directory). Never, /// This is almost same as Never, but it allows page cache of directories, dentries and attr /// cache in guest. In other words, it acts like cache=never for normal files, and like /// cache=always for directories, besides, metadata like dentries and attrs are kept as well. /// This policy can be used if: /// 1. the client wants to use Never policy but it's performance in I/O is not good enough /// 2. the file system has exclusive access to the directory /// 3. cache directory content and other fs metadata can make a difference on performance. Metadata, /// The client is free to choose when and how to cache file data. This is the default policy and /// uses close-to-open consistency as described in the enum documentation. 
#[default] Auto, /// The client should always cache file data. This means that the FUSE client will not /// invalidate any cached data that was returned by the file system the last time the file was /// opened. This policy should only be selected when the file system has exclusive access to the /// directory. Always, } impl FromStr for CachePolicy { type Err = &'static str; fn from_str(s: &str) -> Result { match &s.to_lowercase()[..] { "never" => Ok(CachePolicy::Never), "metadata" => Ok(CachePolicy::Metadata), "auto" => Ok(CachePolicy::Auto), "always" => Ok(CachePolicy::Always), _ => Err("invalid cache policy"), } } } /// When to use file handles to reference inodes instead of `O_PATH` file descriptors. #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] pub enum InodeFileHandlesMode { /// Never use file handles, always use `O_PATH` file descriptors. #[default] Never, /// Attempt to generate file handles, but fall back to `O_PATH` file descriptors where the /// underlying filesystem does not support file handles. Prefer, /// Always use file handles, never fall back to `O_PATH` file descriptors. Mandatory, } /// What to do when an error occurs during migration (checked on the migration destination) #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] pub enum MigrationOnError { /// Whenever any failure occurs, return a hard error to the vhost-user front-end (e.g. QEMU), /// aborting migration. #[default] Abort, /// Let migration finish, but the guest will be unable to access any of the files that were /// failed to be found/opened, receiving only errors. 
GuestError, } impl FromStr for MigrationOnError { type Err = &'static str; fn from_str(s: &str) -> std::result::Result { match s { "abort" => Ok(MigrationOnError::Abort), "guest-error" => Ok(MigrationOnError::GuestError), _ => Err("invalid migration-on-error value"), } } } /// How to migrate our internal state to the destination instance #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] pub enum MigrationMode { /** * Obtain paths for all inodes indexed and opened by the guest, and transfer those paths to * the destination. * * To get those paths, we try to read the symbolic links in /proc/self/fd first; if that does * not work, we will fall back to iterating through the shared directory (exhaustive search), * enumerating all paths within. */ #[default] FindPaths, /// Transfer inodes by their file handles. FileHandles, } impl FromStr for MigrationMode { type Err = &'static str; fn from_str(s: &str) -> std::result::Result { match s { "find-paths" => Ok(MigrationMode::FindPaths), "file-handles" => Ok(MigrationMode::FileHandles), _ => Err("invalid migration-mode value"), } } } /// Options that configure the behavior of the file system. #[derive(Debug)] pub struct Config { /// How long the FUSE client should consider directory entries to be valid. If the contents of a /// directory can only be modified by the FUSE client (i.e., the file system has exclusive /// access), then this should be a large value. /// /// The default value for this option is 5 seconds. pub entry_timeout: Duration, /// How long the FUSE client should consider file and directory attributes to be valid. If the /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file /// system has exclusive access), then this should be set to a large value. /// /// The default value for this option is 5 seconds. pub attr_timeout: Duration, /// The caching policy the file system should use. See the documentation of `CachePolicy` for /// more details. 
pub cache_policy: CachePolicy, /// Whether the file system should enabled writeback caching. This can improve performance as it /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file /// system. However, enabling this option can increase the risk of data corruption if the file /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT** /// have exclusive access). Additionally, the file system should have read access to all files /// in the directory it is serving as the FUSE client may send read requests even for files /// opened with `O_WRONLY`. /// /// Therefore callers should only enable this option when they can guarantee that: 1) the file /// system has exclusive access to the directory and 2) the file system has read permissions for /// all files in that directory. /// /// The default value for this option is `false`. pub writeback: bool, /// The path of the root directory. /// /// The default is `/`. pub root_dir: String, /// A prefix to strip from the mount points listed in /proc/self/mountinfo. /// /// The default is `None`. pub mountinfo_prefix: Option, /// Whether the file system should support Extended Attributes (xattr). Enabling this feature may /// have a significant impact on performance, especially on write parallelism. This is the result /// of FUSE attempting to remove the special file privileges after each write request. /// /// The default value for this options is `false`. pub xattr: bool, /// An optional translation layer for host<->guest Extended Attribute (xattr) names. pub xattrmap: Option, /// The xattr name that "security.capability" is remapped to, if the client remapped it at all. /// If the client's xattrmap did not remap "security.capability", this will be `None`. pub xattr_security_capability: Option, /// Optional `File` object for /proc/self/fd. Callers can open a `File` and pass it here, so /// there's no need to open it in PassthroughFs::new(). 
This is specially useful for /// sandboxing. /// /// The default is `None`. pub proc_sfd_rawfd: Option, /// Optional `File` object for /proc/self/mountinfo. Callers can open a `File` and pass it /// here, so there is no need to open it in PassthroughFs::new(). This is especially useful /// for sandboxing. /// /// The default is `None`. pub proc_mountinfo_rawfd: Option, /// Whether the file system should announce submounts to the guest. Not doing so means that /// the FUSE client may see st_ino collisions: This stat field is passed through, so if the /// shared directory encompasses multiple mounts, some inodes (in different file systems) may /// have the same st_ino value. If the FUSE client does not know these inodes are in different /// file systems, then it will be oblivious to this collision. /// By announcing submount points, the FUSE client can create virtual submounts with distinct /// st_dev values where necessary, so that the combination of st_dev and st_ino will stay /// unique. /// On the other hand, it may be undesirable to let the client know the shared directory's /// submount structure. The user needs to decide which drawback weighs heavier for them, which /// is why this is a configurable option. /// /// The default is `false`. pub announce_submounts: bool, /// Whether to use file handles to reference inodes. We need to be able to open file /// descriptors for arbitrary inodes, and by default that is done by storing an `O_PATH` FD in /// `InodeData`. Not least because there is a maximum number of FDs a process can have open /// users may find it preferable to store a file handle instead, which we can use to open an FD /// when necessary. /// So this switch allows to choose between the alternatives: When set to `Never`, `InodeData` /// will store `O_PATH` FDs. Otherwise, we will attempt to generate and store a file handle /// instead. 
With `Prefer`, errors that are inherent to file handles (like no support from the /// underlying filesystem) lead to falling back to `O_PATH` FDs, and only generic errors (like /// `ENOENT` or `ENOMEM`) are passed to the guest. `Mandatory` enforces the use of file /// handles, returning all errors to the guest. /// /// The default is `Never`. pub inode_file_handles: InodeFileHandlesMode, /// Whether the file system should support READDIRPLUS (READDIR+LOOKUP) operations. /// /// The default is `false`. pub readdirplus: bool, /// Whether the file system should honor the O_DIRECT flag. If this option is disabled (which /// is the default value), that flag will be filtered out at `open_inode`. /// /// The default is `false`. pub allow_direct_io: bool, /// If `killpriv_v2` is true then it indicates that the file system is expected to clear the /// setuid and setgid bits. pub killpriv_v2: bool, /// Enable support for posix ACLs /// /// The default is `false`. pub posix_acl: bool, /// If `security_label` is true, then server will indicate to client /// to send any security context associated with file during file /// creation and set that security context on newly created file. /// This security context is expected to be security.selinux. /// /// The default is `false`. pub security_label: bool, /// If `clean_noatime` is true automatically clean up O_NOATIME flag to prevent potential /// permission errors. pub clean_noatime: bool, /// If `allow_mmap` is true, then server will allow shared mmap'ing of files opened/created /// with DIRECT_IO. /// /// The default is `false`. pub allow_mmap: bool, /// Defines what happens when restoring our internal state on the destination fails. /// /// The default is `Abort`. pub migration_on_error: MigrationOnError, /// Whether to store a file handle for each inode in the migration stream, alongside the /// information on how to find the inode. 
The destination must generate the file handle for /// the inode it has opened and verify they match. /// /// The default is `false`. pub migration_verify_handles: bool, /// Whether to confirm (for path-based migration) at serialization (during switch-over) whether /// the paths still match the inodes they are supposed to represent, and if they do not, try to /// correct the path via the respective symlink in /proc/self/fd. /// /// The default is `false`. pub migration_confirm_paths: bool, /// Defines how to migrate our internal state to the destination instance. /// /// The default is `FindPaths`. pub migration_mode: MigrationMode, /** * UID map parameters given on the command line. * * Is `take()`n when `PassthroughFs` is created, i.e. `None` during runtime. */ pub uid_map: Option>, /** * GID map parameters given on the command line. * * Is `take()`n when `PassthroughFs` is created, i.e. `None` during runtime. */ pub gid_map: Option>, } impl Default for Config { fn default() -> Self { Config { entry_timeout: Duration::from_secs(5), attr_timeout: Duration::from_secs(5), cache_policy: Default::default(), writeback: false, root_dir: String::from("/"), mountinfo_prefix: None, xattr: false, xattrmap: None, xattr_security_capability: None, proc_sfd_rawfd: None, proc_mountinfo_rawfd: None, announce_submounts: false, inode_file_handles: Default::default(), readdirplus: true, allow_direct_io: false, killpriv_v2: false, posix_acl: false, security_label: false, clean_noatime: true, allow_mmap: false, migration_on_error: MigrationOnError::Abort, migration_verify_handles: false, migration_confirm_paths: false, migration_mode: MigrationMode::FindPaths, uid_map: None, gid_map: None, } } } /// A file system that simply "passes through" all requests it receives to the underlying file /// system. To keep the implementation simple it servers the contents of its root directory. 
Users /// that wish to serve only a specific directory should set up the environment so that that /// directory ends up as the root of the file system process. One way to accomplish this is via a /// combination of mount namespaces and the pivot_root system call. pub struct PassthroughFs { // File descriptors for various points in the file system tree. These fds are always opened with // the `O_PATH` option so they cannot be used for reading or writing any data. See the // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot // do with an fd opened with this flag. inodes: InodeStore, next_inode: AtomicU64, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be // used for reading and writing data. handles: RwLock>>, next_handle: AtomicU64, // Maps mount IDs to an open FD on the respective ID for the purpose of open_by_handle_at(). // This is set when inode_file_handles is not never, since in the 'never' case, // open_by_handle_at() is not called. mount_fds: Option, // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from // `inodes` into one that can go into `handles`. This is accomplished by reading the // `/proc/self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are // meant to be serving doesn't have access to `/proc/self/fd`. proc_self_fd: File, // File descriptor pointing to the `/` directory. root_fd: File, // Whether writeback caching is enabled for this directory. This will only be true when // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`. writeback: AtomicBool, // Whether to announce submounts (i.e., whether the guest supports them and whether they are // enabled in the configuration) announce_submounts: AtomicBool, // Whether posix ACLs is enabled. 
posix_acl: AtomicBool, // Basic facts about the OS os_facts: oslib::OsFacts, // Whether the guest kernel supports the supplementary group extension. sup_group_extension: AtomicBool, // Whether we are preparing for migration and need to track changes to inodes like renames. We // should then also make sure newly created inodes immediately have their migration info set. track_migration_info: AtomicBool, cfg: Config, /// Map to translate between host and guest UIDs. uid_map: IdMap, /// Map to translate between host and guest GIDs. gid_map: IdMap, } impl PassthroughFs { pub fn new(mut cfg: Config) -> io::Result { let proc_self_fd = if let Some(fd) = cfg.proc_sfd_rawfd.take() { fd } else { openat_verbose( &libc::AT_FDCWD, "/proc/self/fd", libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, )? }; let root_fd = openat_verbose( &libc::AT_FDCWD, "/", libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, )?; let mount_fds = if cfg.inode_file_handles == InodeFileHandlesMode::Never && cfg.migration_mode != MigrationMode::FileHandles { None } else { let mountinfo_fd = if let Some(fd) = cfg.proc_mountinfo_rawfd.take() { fd } else { openat_verbose( &libc::AT_FDCWD, "/proc/self/mountinfo", libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC, )? }; Some(MountFds::new(mountinfo_fd, cfg.mountinfo_prefix.clone())) }; let uid_map = if let Some(map) = cfg.uid_map.take() { map.try_into().err_context(|| "UID map")? } else { IdMap::empty() }; let gid_map = if let Some(map) = cfg.gid_map.take() { map.try_into().err_context(|| "GID map")? 
} else { IdMap::empty() }; let mut fs = PassthroughFs { inodes: Default::default(), next_inode: AtomicU64::new(fuse::ROOT_ID + 1), handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(0), mount_fds, proc_self_fd, root_fd, writeback: AtomicBool::new(false), announce_submounts: AtomicBool::new(false), posix_acl: AtomicBool::new(false), sup_group_extension: AtomicBool::new(false), os_facts: oslib::OsFacts::new(), track_migration_info: AtomicBool::new(false), cfg, uid_map, gid_map, }; // Check to see if the client remapped "security.capability", if so, // stash its mapping since the daemon will have to enforce semantics // that the host kernel otherwise would if the xattrname was not mapped. let sec_xattr = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.capability\0") }; fs.cfg.xattr_security_capability = fs .map_client_xattrname(sec_xattr) .ok() .filter(|n| !sec_xattr.eq(n)) .map(CString::from); fs.check_working_file_handles()?; // We need to clear the umask here because we want the client to be // able to set all the bits in the mode. oslib::umask(0o000); Ok(fs) } pub fn keep_fds(&self) -> Vec { vec![self.proc_self_fd.as_raw_fd()] } fn open_relative_to( &self, dir: &impl AsRawFd, pathname: &CStr, flags: i32, mode: Option, ) -> io::Result { let flags = libc::O_NOFOLLOW | libc::O_CLOEXEC | flags; if self.os_facts.has_openat2 { oslib::do_open_relative_to(dir, pathname, flags, mode) } else { oslib::openat(dir, pathname, flags, mode) } } fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result> { self.handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .cloned() .ok_or_else(ebadf) } fn open_inode(&self, inode: Inode, mut flags: i32) -> io::Result { let data = self.inodes.get(inode).ok_or_else(ebadf)?; // When writeback caching is enabled, the kernel may send read requests even if the // userspace program opened the file write-only. So we need to ensure that we have opened // the file for reading as well as writing. 
let writeback = self.writeback.load(Ordering::Relaxed); if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY { flags &= !libc::O_ACCMODE; flags |= libc::O_RDWR; } // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`. // However, this breaks atomicity as the file may have changed on disk, invalidating the // cached copy of the data in the kernel and the offset that the kernel thinks is the end of // the file. Just allow this for now as it is the user's responsibility to enable writeback // caching only for directories that are not shared. It also means that we need to clear the // `O_APPEND` flag. if writeback && flags & libc::O_APPEND != 0 { flags &= !libc::O_APPEND; } if !self.cfg.allow_direct_io && flags & libc::O_DIRECT != 0 { flags &= !libc::O_DIRECT; } data.open_file(flags | libc::O_CLOEXEC, &self.proc_self_fd)? .into_file() } /// Generate a file handle for `fd` using `FileHandle::from_fd()`. `st` is `fd`'s stat /// information (we may need the mount ID for errors/warnings). /// /// These are the possible return values: /// - `Ok(Some(_))`: Success, caller should use this file handle. /// - `Ok(None)`: No error, but no file handle is available. The caller should fall back to /// using an `O_PATH` FD. /// - `Err(_)`: An error occurred, the caller should return this to the guest. /// /// This function takes the chosen `self.cfg.inode_file_handles` mode into account: /// - `Never`: Always return `Ok(None)`. /// - `Prefer`: Return `Ok(None)` when file handles are not supported by this filesystem. /// Otherwise, return either `Ok(Some(_))` or `Err(_)`, depending on whether a file /// handle could be generated or not. /// - `Mandatory`: Never return `Ok(None)`. When the filesystem does not support file handles, /// return an `Err(_)`. /// /// When the filesystem does not support file handles, this is logged (as a warning in /// `Prefer` mode, and as an error in `Mandatory` mode) one time per filesystem. 
fn get_file_handle_opt( &self, fd: &impl AsRawFd, st: &StatExt, ) -> io::Result> { let handle = match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => { // Let's make this quick, so we can skip this case below return Ok(None); } InodeFileHandlesMode::Prefer | InodeFileHandlesMode::Mandatory => { FileHandle::from_fd(fd)? } }; if handle.is_none() { // No error, but no handle (because of EOPNOTSUPP/EOVERFLOW)? Log it. let io_err = io::Error::from_raw_os_error(libc::EOPNOTSUPP); let desc = match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => unreachable!(), InodeFileHandlesMode::Prefer => { "Filesystem does not support file handles, falling back to O_PATH FDs" } InodeFileHandlesMode::Mandatory => "Filesystem does not support file handles", }; // Use the MPRError object, because (with a mount ID obtained through statx()) // `self.mount_fds.error_for()` will attempt to add a prefix to the error description // that describes the offending filesystem by mount point and mount ID, and will also // suppress the message if we have already logged any error concerning file handles for // the respective filesystem (so we only log errors/warnings once). let err: MPRError = if st.mnt_id > 0 { // Valid mount ID // self.mount_fds won't be None if we enter here. self.mount_fds .as_ref() .unwrap() .error_for(st.mnt_id, io_err) } else { // No valid mount ID, return error object not bound to a filesystem io_err.into() } .set_desc(desc.to_string()); // In `Prefer` mode, warn; in `Mandatory` mode, log and return an error. // (Suppress logging if the error is silenced, which means that we have already logged // a warning/error for this filesystem.) 
match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => unreachable!(), InodeFileHandlesMode::Prefer => { if !err.silent() { warn!("{}", err); } } InodeFileHandlesMode::Mandatory => { if !err.silent() { error!("{}", err); } return Err(err.into_inner()); } } } Ok(handle) } fn make_file_handle_openable(&self, fh: &FileHandle) -> io::Result { // self.mount_fds won't be None if we enter here. fh.to_openable(self.mount_fds.as_ref().unwrap(), |fd, flags| { reopen_fd_through_proc(&fd, flags, &self.proc_self_fd) }) .map_err(|e| { if !e.silent() { error!("{}", e); } e.into_inner() }) } fn check_working_file_handles(&mut self) -> io::Result<()> { if self.cfg.inode_file_handles == InodeFileHandlesMode::Never { // No need to check anything return Ok(()); } // Try to open the root directory, turn it into a file handle, then try to open that file // handle to see whether file handles do indeed work // (Note that we pass through all I/O errors to the caller, because `PassthroughFs::init()` // will do these calls (`openat()`, `stat()`, etc.) anyway, so if they do not work now, // they probably are not going to work later either. Better to report errors early then.) let root_dir = openat_verbose( &libc::AT_FDCWD, self.cfg.root_dir.as_str(), libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, )?; let st = statx(&root_dir, None)?; if let Some(h) = self.get_file_handle_opt(&root_dir, &st)? 
{ // Got an openable file handle, try opening it match self.make_file_handle_openable(&h)?.open(libc::O_PATH) { Ok(_) => (), Err(e) => match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => unreachable!(), InodeFileHandlesMode::Prefer => { warn!("Failed to open file handle for the root node: {}", e); warn!("File handles do not appear safe to use, disabling file handles altogether"); self.cfg.inode_file_handles = InodeFileHandlesMode::Never; } InodeFileHandlesMode::Mandatory => { error!("Failed to open file handle for the root node: {}", e); error!("Refusing to use (mandatory) file handles, as they do not appear safe to use"); return Err(e); } }, } } else { // Did not get an openable file handle (nor an error), so we cannot be in `mandatory` // mode. We also cannot be in `never` mode, because that is sorted out at the very // beginning of this function. Still, use `match` so the compiler could warn us if we // were to forget some (future?) variant. match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => unreachable!(), InodeFileHandlesMode::Prefer => { warn!("Failed to generate a file handle for the root node, disabling file handles altogether"); self.cfg.inode_file_handles = InodeFileHandlesMode::Never; } InodeFileHandlesMode::Mandatory => unreachable!(), } } Ok(()) } /// Try to look up an inode by its `name` relative to the given `parent` inode. If the inode /// is registered in our inode store (`self.inodes`), return a strong reference to it. /// Otherwise, return `None` instead. /// Along with the inode (or `None`), return information gathered along the way: An `O_PATH` /// file to the inode, stat information, and optionally a file handle if virtiofsd has been /// configured to use file handles. /// Return an error if the parent node cannot be opened, the given inode cannot be found on the /// filesystem, or generating the stat information or file handle fails. 
fn try_lookup_implementation( &self, parent_data: &InodeData, name: &CStr, ) -> io::Result<( Option, File, StatExt, Option, )> { let p_file = parent_data.get_file()?; let path_fd = { let fd = self.open_relative_to(&p_file, name, libc::O_PATH, None)?; // Safe because we just opened this fd. unsafe { File::from_raw_fd(fd) } }; let st = statx(&path_fd, None)?; // Note that this will always be `None` if `cfg.inode_file_handles` is `Never`, but we only // really need the handle when we do not have an `O_PATH` fd open for every inode. So if // `cfg.inode_file_handles` is `Never`, we do not need it anyway. let handle = self.get_file_handle_opt(&path_fd, &st)?; let ids = InodeIds { ino: st.st.st_ino, dev: st.st.st_dev, mnt_id: st.mnt_id, }; Ok(( self.inodes.claim_inode(handle.as_ref(), &ids).ok(), path_fd, st, handle, )) } /// Try to look up an inode by its `name` relative to the parent inode given by its /// `parent_data`. If the inode is registered in our inode store (`self.inodes`), return a /// strong reference to it. Otherwise, return `None`. /// Return an error if the parent node cannot be opened, the given inode cannot be found on the /// filesystem, or generating the Stat information or file handle fails. fn try_lookup( &self, parent_data: &InodeData, name: &CStr, ) -> io::Result> { self.try_lookup_implementation(parent_data, name) .map(|result| result.0) } fn do_lookup(&self, parent: Inode, name: &CStr) -> io::Result { let p = self.inodes.get(parent).ok_or_else(ebadf)?; let (existing_inode, path_fd, st, handle) = self.try_lookup_implementation(&p, name)?; let mut attr_flags: u32 = 0; if st.st.st_mode & libc::S_IFMT == libc::S_IFDIR && self.announce_submounts.load(Ordering::Relaxed) && (st.st.st_dev != p.ids.dev || st.mnt_id != p.ids.mnt_id) { attr_flags |= fuse::ATTR_SUBMOUNT; } let inode = if let Some(inode) = existing_inode { inode } else { let file_or_handle = if let Some(h) = handle.as_ref() { FileOrHandle::Handle(self.make_file_handle_openable(h)?) 
} else { FileOrHandle::File(path_fd) }; let mig_info = if self.track_migration_info.load(Ordering::Relaxed) { let parent_strong_ref = StrongInodeReference::new_with_data(p, &self.inodes)?; Some(InodeMigrationInfo::new( &self.cfg, parent_strong_ref, name, &file_or_handle, )?) } else { None }; let inode_data = InodeData { inode: self.next_inode.fetch_add(1, Ordering::Relaxed), file_or_handle, refcount: AtomicU64::new(1), ids: InodeIds { ino: st.st.st_ino, dev: st.st.st_dev, mnt_id: st.mnt_id, }, mode: st.st.st_mode, migration_info: Mutex::new(mig_info), }; self.inodes.get_or_insert(inode_data)? }; let attr = fuse::Attr::try_with_flags( st.st, attr_flags, |uid| self.map_host_uid(uid), |gid| self.map_host_gid(gid), )?; Ok(Entry { // By leaking, we transfer ownership of this refcount to the guest. That is safe, // because the guest is expected to explicitly release its reference and decrement the // refcount via `FORGET` later. inode: unsafe { inode.leak() }, generation: 0, attr, attr_timeout: self.cfg.attr_timeout, entry_timeout: self.cfg.entry_timeout, }) } fn do_open( &self, inode: Inode, kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { // We need to clean the `O_APPEND` flag in case the file is mem mapped or if the flag // is later modified in the guest using `fcntl(F_SETFL)`. We do a per-write `O_APPEND` // check setting `RWF_APPEND` for non-mmapped writes, if necessary. let mut flags = flags & !(libc::O_APPEND as u32); // Clean O_NOATIME (unless specified otherwise with --preserve-noatime) to prevent // potential permission errors when running in unprivileged mode. if self.cfg.clean_noatime { flags &= !(libc::O_NOATIME as u32) } let file = { let _killpriv_guard = if self.cfg.killpriv_v2 && kill_priv { drop_effective_cap("FSETID")? } else { None }; self.open_inode(inode, flags as i32)? 
}; if flags & (libc::O_TRUNC as u32) != 0 { self.clear_file_capabilities(file.as_raw_fd(), false)?; } let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); let data = HandleData { inode, file: file.into(), migration_info: HandleMigrationInfo::new(flags as i32), }; self.handles.write().unwrap().insert(handle, Arc::new(data)); let mut opts = OpenOptions::empty(); match self.cfg.cache_policy { // We only set the direct I/O option on files. CachePolicy::Never => opts.set( OpenOptions::DIRECT_IO, flags & (libc::O_DIRECTORY as u32) == 0, ), CachePolicy::Metadata => { if flags & (libc::O_DIRECTORY as u32) == 0 { opts |= OpenOptions::DIRECT_IO; } else { opts |= OpenOptions::CACHE_DIR | OpenOptions::KEEP_CACHE; } } CachePolicy::Always => { opts |= OpenOptions::KEEP_CACHE; if flags & (libc::O_DIRECTORY as u32) != 0 { opts |= OpenOptions::CACHE_DIR; } } _ => {} }; Ok((Some(handle), opts)) } fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> { let mut handles = self.handles.write().unwrap(); if let btree_map::Entry::Occupied(e) = handles.entry(handle) { if e.get().inode == inode { // We don't need to close the file here because that will happen automatically when // the last `Arc` is dropped. e.remove(); return Ok(()); } } Err(ebadf()) } fn do_getattr(&self, inode: Inode) -> io::Result<(fuse::Attr, Duration)> { let data = self.inodes.get(inode).ok_or_else(ebadf)?; let inode_file = data.get_file()?; let st = statx(&inode_file, None)?.st; let attr = fuse::Attr::try_from_stat64( st, |uid| self.map_host_uid(uid), |gid| self.map_host_gid(gid), )?; Ok((attr, self.cfg.attr_timeout)) } fn do_unlink(&self, parent: Inode, name: &CStr, flags: libc::c_int) -> io::Result<()> { let data = self.inodes.get(parent).ok_or_else(ebadf)?; let parent_file = data.get_file()?; let invalidated_inode = self.before_invalidating_path(&data, name); // Safe because this doesn't modify any memory and we check the return value. 
let res = unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), flags) }; if let Some(invalidated_inode) = invalidated_inode { self.after_invalidating_path(invalidated_inode, "Unlinked"); } if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn block_xattr(&self, name: &[u8]) -> bool { // Currently we only filter out posix acl xattrs. // If acls are enabled, there is nothing to filter. if self.posix_acl.load(Ordering::Relaxed) { return false; } let acl_access = "system.posix_acl_access".as_bytes(); let acl_default = "system.posix_acl_default".as_bytes(); acl_access.starts_with(name) || acl_default.starts_with(name) } fn map_client_xattrname<'a>(&self, name: &'a CStr) -> std::io::Result> { if self.block_xattr(name.to_bytes()) { return Err(io::Error::from_raw_os_error(libc::ENOTSUP)); } match &self.cfg.xattrmap { Some(map) => match map.map_client_xattr(name).expect("unterminated mapping") { AppliedRule::Deny => Err(io::Error::from_raw_os_error(libc::EPERM)), AppliedRule::Unsupported => Err(io::Error::from_raw_os_error(libc::ENOTSUP)), AppliedRule::Pass(new_name) => Ok(new_name), }, None => Ok(Cow::Borrowed(name)), } } fn map_server_xattrlist(&self, xattr_names: Vec) -> Vec { let all_xattrs = match &self.cfg.xattrmap { Some(map) => map .map_server_xattrlist(xattr_names) .expect("unterminated mapping"), None => xattr_names, }; // filter out the blocked xattrs let mut filtered = Vec::with_capacity(all_xattrs.len()); let all_xattrs = all_xattrs.split(|b| *b == 0).filter(|bs| !bs.is_empty()); for xattr in all_xattrs { if !self.block_xattr(xattr) { filtered.extend_from_slice(xattr); filtered.push(0); } } filtered.shrink_to_fit(); filtered } /// Clears file capabilities /// /// * `fd` - A file descriptor /// * `o_path` - Must be `true` if the file referred to by `fd` was opened with the `O_PATH` flag /// /// If it is not clear whether `fd` was opened with `O_PATH` it is safe to set `o_path` /// to `true`. 
    fn clear_file_capabilities(&self, fd: RawFd, o_path: bool) -> io::Result<()> {
        match self.cfg.xattr_security_capability.as_ref() {
            // Unmapped, let the kernel take care of this.
            None => Ok(()),
            // Otherwise we have to uphold the same semantics the kernel
            // would; which is to drop the "security.capability" xattr
            // on write
            Some(xattrname) => {
                let res = if o_path {
                    // `O_PATH` fds reject f*xattr(2) calls, so go through the magic symlink in
                    // /proc/self/fd instead: temporarily fchdir into proc_self_fd (guard
                    // restores root_fd's directory on drop) and address the fd by name.
                    let proc_file_name = CString::new(format!("{fd}"))
                        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
                    let _working_dir_guard = set_working_directory(
                        self.proc_self_fd.as_raw_fd(),
                        self.root_fd.as_raw_fd(),
                    );
                    unsafe { libc::removexattr(proc_file_name.as_ptr(), xattrname.as_ptr()) }
                } else {
                    // Regular fd: remove the xattr directly.
                    unsafe { libc::fremovexattr(fd, xattrname.as_ptr()) }
                };

                if res == 0 {
                    Ok(())
                } else {
                    let eno = io::Error::last_os_error();
                    // ENODATA: the file had no capability xattr to begin with; ENOTSUP: the
                    // filesystem does not support xattrs at all. Neither is an error for our
                    // purposes ("make sure no capability xattr remains").
                    match eno.raw_os_error().unwrap() {
                        libc::ENODATA | libc::ENOTSUP => Ok(()),
                        _ => Err(eno),
                    }
                }
            }
        }
    }

    /// Clears S_ISGID from file mode
    ///
    /// * `file` - file reference (must implement AsRawFd)
    /// * `o_path` - Must be `true` if the file referred to by `fd` was opened with the `O_PATH` flag
    ///
    /// If it is not clear whether `fd` was opened with `O_PATH` it is safe to set `o_path`
    /// to `true`.
fn clear_sgid(&self, file: &impl AsRawFd, o_path: bool) -> io::Result<()> { let fd = file.as_raw_fd(); let st = statx(file, None)?.st; if o_path { oslib::fchmodat( self.proc_self_fd.as_raw_fd(), format!("{fd}"), st.st_mode & 0o7777 & !libc::S_ISGID, 0, ) } else { oslib::fchmod(fd, st.st_mode & 0o7777 & !libc::S_ISGID) } } #[allow(clippy::too_many_arguments)] fn do_create( &self, ctx: &Context, parent_file: &InodeFile, name: &CStr, mode: u32, flags: u32, umask: u32, extensions: Extensions, ) -> io::Result { let fd = { let _credentials_guard = self.unix_credentials_guard(ctx, &extensions)?; let _umask_guard = self .posix_acl .load(Ordering::Relaxed) .then(|| oslib::ScopedUmask::new(umask)); // Add libc:O_EXCL to ensure we're not accidentally opening a file the guest wouldn't // be allowed to access otherwise. self.open_relative_to( parent_file, name, flags as i32 | libc::O_CREAT | libc::O_EXCL, mode.into(), )? }; // Set security context if let Some(secctx) = extensions.secctx { // Remap security xattr name. let xattr_name = match self.map_client_xattrname(&secctx.name) { Ok(xattr_name) => xattr_name, Err(e) => { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), 0); } return Err(e); } }; let ret = unsafe { libc::fsetxattr( fd, xattr_name.as_ptr(), secctx.secctx.as_ptr() as *const libc::c_void, secctx.secctx.len(), 0, ) }; if ret != 0 { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), 0); } return Err(io::Error::last_os_error()); } } Ok(fd) } fn do_mknod_mkdir_symlink_secctx( &self, parent_file: &InodeFile, name: &CStr, secctx: &SecContext, ) -> io::Result<()> { // Remap security xattr name. let xattr_name = self.map_client_xattrname(&secctx.name)?; // Set security context on newly created node. It could be // device node as well, so it is not safe to open the node // and call fsetxattr(). Instead, use the fchdir(proc_fd) // and call setxattr(o_path_fd). We use this trick while // setting xattr as well. 
// Open O_PATH fd for dir/symlink/special node just created. let path_fd = self.open_relative_to(parent_file, name, libc::O_PATH, None)?; let procname = CString::new(format!("{path_fd}")) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)); let procname = match procname { Ok(name) => name, Err(error) => { return Err(error); } }; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); let res = unsafe { libc::setxattr( procname.as_ptr(), xattr_name.as_ptr(), secctx.secctx.as_ptr() as *const libc::c_void, secctx.secctx.len(), 0, ) }; let res_err = io::Error::last_os_error(); if res == 0 { Ok(()) } else { Err(res_err) } } pub fn open_root_node(&self) -> io::Result<()> { // We use `O_PATH` because we just want this for traversing the directory tree // and not for actually reading the contents. We don't use `open_relative_to()` // here because we are not opening a guest-provided pathname. Also, `self.cfg.root_dir` // is an absolute pathname, thus not relative to CWD, so we will not be able to open it // if "/" didn't change (e.g., chroot or pivot_root) let path_fd = openat( &libc::AT_FDCWD, self.cfg.root_dir.as_str(), libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, )?; let st = statx(&path_fd, None)?; let handle = self.get_file_handle_opt(&path_fd, &st)?; let file_or_handle = if let Some(h) = handle.as_ref() { FileOrHandle::Handle(self.make_file_handle_openable(h)?) } else { FileOrHandle::File(path_fd) }; // Always keep the root node's migration info set (`InodeStore::clear_migration_info()` // will not clear it either); this way, whenever the filesystem is mounted (and this // function is called), we will have it set and can migrate it. // (Other nodes' migration info is set in `do_lookup()` when they are discovered during // migration.) 
let migration_info = match InodeMigrationInfo::new_root(&self.cfg, &file_or_handle) { Ok(mig_info) => Some(mig_info), Err(err) => { warn!( "Failed to construct migration information for the root node: {err}; \ may not be able to migrate" ); None } }; // Not sure why the root inode gets a refcount of 2 but that's what libfuse does. let inode = InodeData { inode: fuse::ROOT_ID, file_or_handle, refcount: AtomicU64::new(2), ids: InodeIds { ino: st.st.st_ino, dev: st.st.st_dev, mnt_id: st.mnt_id, }, mode: st.st.st_mode, migration_info: Mutex::new(migration_info), }; self.inodes.new_inode(inode)?; Ok(()) } /// After renaming an inode while preparing for migration, update its migration info if /// necessary. For example, when representing inodes through their filename and parent /// directory node, these must be updated to match the new name and location. /// `parent` and `filename` are the inode's new location. fn update_inode_migration_info( &self, parent_data: Arc, filename: &CStr, ) -> io::Result<()> { if !self.track_migration_info.load(Ordering::Relaxed) { // Not preparing for migration? Nothing to do. return Ok(()); } // We only need to update the node's migration info if we have it in our store if let Some(inode) = self.try_lookup(&parent_data, filename)? { let inode_data = inode.get(); let parent_strong_ref = StrongInodeReference::new_with_data(parent_data, &self.inodes)?; let mut info_locked = inode_data.migration_info.lock().unwrap(); // Unconditionally clear any potentially existing path, because it will be outdated if let Some(info) = info_locked.take() { if !info.has_path() { // We have some migration info, but it is not path-based? Keep it then. *info_locked = Some(info); return Ok(()); } } *info_locked = Some(InodeMigrationInfo::new( &self.cfg, parent_strong_ref, filename, &inode_data.file_or_handle, )?); } Ok(()) } /** * Prepare for a path to be invalidated (e.g. by overwriting or unlinking), which can be * relevant to migration. 
* * If there is an inode in our inode store on the given path, return it. If so, the caller * must call `after_invalidating_path()` after the invalidating operation is done. * * Background: When an inode's path is invalidated (while preparing for migration), e.g. * because it is unlinked or overwritten by a different inode, its migration info must too be * invalidated so we do not transmit a wrong path to the destination. While we must look for * the inode before the invalidating operation (lest it is gone), we must invalidate its * migration info only *after* that operation, or we’d have a TOCTTOU problem. Consider this * order of execution: * 1. We invalidate the path in the migration info (before executing the operation) * 2. Preserialization process runs in the background; before the operation is done, it finds * the still-existing (old) inode, re-setting the migration info's path to the one we just * cleared * 3. Operation executes, making that path point to a different inode * * Therefore, the invalidation process is split into two parts, `before_invalidating_path()` * and `after_invalidating_path()`. */ fn before_invalidating_path( &self, parent: &InodeData, filename: &CStr, ) -> Option { // Note that we have to do this unconditionally, regardless of the value of // `track_migration_info` -- same TOCTTOU problem as described in the comment above applies // (preserialization might start before the operation is done) self.try_lookup(parent, filename).ok().flatten() } /** * Counterpart to `before_invalidating_path()`; must be called after the path-invalidating * operation. * * If the inode’s migration info contains any path data, it is invalidated, and we * try to refresh it from /proc/self/fd: It’s possible that the operation failed (and so the * original path is still intact), or that the inode has another hard link left that the * kernel knows about (unlikely, but worth a try). 
* * `inode` is the reference returned by `before_invalidating_path()`, `old_parent` and * `old_filename` are used solely to generate a human-readable warning, as is `operation`, * which is a capitalized simple past verb form describing the operation (e.g. "Overwrote"). */ fn after_invalidating_path(&self, inode: StrongInodeReference, operation: &str) { let inode_data = inode.get(); let old_location = { let mut migration_info_locked = inode_data.migration_info.lock().unwrap(); let Some(migration_info) = migration_info_locked.take() else { // No migration info? Nothing to do then. return; }; if !migration_info.has_path() { // No path in the migration info? Nothing to invalidate then. *migration_info_locked = Some(migration_info); return; } migration_info.location }; match self.after_invalidating_path_refresh(inode_data) { Ok(()) => { if let Some(migration_info) = inode_data.migration_info.lock().unwrap().as_ref() { let new_location = &migration_info.location; info!("{operation} {old_location}, but found under {new_location}"); } else { warn!( "{operation} {old_location}, seem to have found new path, but lost it \ again; may be unable to migrate inode" ); } } Err(err) => warn!( "{operation} {old_location}, failed to get new path: {err}; will be unable to \ migrate inode" ), } } /** * Try to find a new path to the given inode. * * Used internally by `after_invalidating_path()`. Updates all migration info objects along * the path. */ fn after_invalidating_path_refresh(&self, inode_data: &InodeData) -> io::Result<()> { let shared_dir_path = self .inodes .get(fuse::ROOT_ID) .ok_or_else(|| other_io_error("Shared directory root node not found"))? 
.get_path(&self.proc_self_fd) .map_err(io::Error::from) .err_context(|| "Failed to get shared directory root path")?; preserialization::proc_paths::set_path_migration_info_from_proc_self_fd( inode_data, self, &shared_dir_path, ) .map_err(preserialization::proc_paths::WrappedError::into_inner) } /** * Temporarily changes the effective UID and GID. * * Changes the effective UID and GID to the one required by `ctx`, potentially including a * supplementary GID given in `extensions`. Return to the previous state once the returned * guard is dropped. */ fn unix_credentials_guard( &self, ctx: &Context, extensions: &Extensions, ) -> io::Result> { let host_uid = self.map_guest_uid(ctx.uid)?; let host_gid = self.map_guest_gid(ctx.gid)?; let supp_gid = extensions .sup_gid .map(|gid| self.map_guest_gid(gid)) .transpose()?; UnixCredentials::new(host_uid, host_gid) .supplementary_gid(self.sup_group_extension.load(Ordering::Relaxed), supp_gid) .set() } /// Translate `guest_uid` to a host UID using [`self.uid_map`](`Self#structfield.uid_map`). fn map_guest_uid(&self, guest_uid: GuestUid) -> io::Result { self.uid_map.map_guest(guest_uid).map_err(Into::into) } /// Translate `guest_gid` to a host GID using [`self.gid_map`](`Self#structfield.gid_map`). fn map_guest_gid(&self, guest_gid: GuestGid) -> io::Result { self.gid_map.map_guest(guest_gid).map_err(Into::into) } /// Translate `host_uid` to a guest UID using [`self.uid_map`](`Self#structfield.uid_map`). fn map_host_uid(&self, host_uid: HostUid) -> io::Result { self.uid_map.map_host(host_uid).map_err(Into::into) } /// Translate `host_gid` to a guest GID using [`self.gid_map`](`Self#structfield.gid_map`). 
fn map_host_gid(&self, host_gid: HostGid) -> io::Result { self.gid_map.map_host(host_gid).map_err(Into::into) } } impl FileSystem for PassthroughFs { type Inode = Inode; type Handle = Handle; type DirIter = ReadDir>; fn init(&self, capable: FsOptions) -> io::Result { // Force-wipe prior state in case someone "forgot" to send a DESTROY self.destroy(); self.open_root_node()?; // Note: On migration, all options negotiated here with the guest must be sent to the // destination in the `device_state::serialized::NegotiatedOpts` structure. So when adding // a new option here, don't forget to add it there, too, and handle it both in // `>::from()` and // `serialized::NegotiatedOpts::apply()`. let mut opts = if self.cfg.readdirplus { FsOptions::DO_READDIRPLUS | FsOptions::READDIRPLUS_AUTO } else { FsOptions::empty() }; if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) { opts |= FsOptions::WRITEBACK_CACHE; self.writeback.store(true, Ordering::Relaxed); } if self.cfg.announce_submounts { if capable.contains(FsOptions::SUBMOUNTS) { self.announce_submounts.store(true, Ordering::Relaxed); } else { eprintln!("Warning: Cannot announce submounts, client does not support it"); } } if self.cfg.killpriv_v2 { if capable.contains(FsOptions::HANDLE_KILLPRIV_V2) { opts |= FsOptions::HANDLE_KILLPRIV_V2; } else { warn!("Cannot enable KILLPRIV_V2, client does not support it"); } } if self.cfg.posix_acl { let acl_required_flags = FsOptions::POSIX_ACL | FsOptions::DONT_MASK | FsOptions::SETXATTR_EXT; if capable.contains(acl_required_flags) { opts |= acl_required_flags; self.posix_acl.store(true, Ordering::Relaxed); debug!("init: enabling posix acl"); } else { error!("Cannot enable posix ACLs, client does not support it"); return Err(io::Error::from_raw_os_error(libc::EPROTO)); } } if self.cfg.security_label { if capable.contains(FsOptions::SECURITY_CTX) { opts |= FsOptions::SECURITY_CTX; } else { error!("Cannot enable security label. 
kernel does not support FUSE_SECURITY_CTX capability"); return Err(io::Error::from_raw_os_error(libc::EPROTO)); } } if self.cfg.allow_mmap { opts |= FsOptions::DIRECT_IO_ALLOW_MMAP; } if capable.contains(FsOptions::CREATE_SUPP_GROUP) { self.sup_group_extension.store(true, Ordering::Relaxed); } Ok(opts) } fn destroy(&self) { self.handles.write().unwrap().clear(); self.inodes.clear(); self.writeback.store(false, Ordering::Relaxed); self.announce_submounts.store(false, Ordering::Relaxed); self.posix_acl.store(false, Ordering::Relaxed); self.sup_group_extension.store(false, Ordering::Relaxed); } fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result { let data = self.inodes.get(inode).ok_or_else(ebadf)?; let inode_file = data.get_file()?; let mut out = MaybeUninit::::zeroed(); // Safe because this will only modify `out` and we check the return value. let res = unsafe { libc::fstatvfs64(inode_file.as_raw_fd(), out.as_mut_ptr()) }; if res == 0 { // Safe because the kernel guarantees that `out` has been initialized. 
Ok(unsafe { out.assume_init() }) } else { Err(io::Error::last_os_error()) } } fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { self.inodes.forget_one(inode, count) } fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) { self.inodes.forget_many(requests) } fn opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { self.do_open(inode, false, flags | (libc::O_DIRECTORY as u32)) } fn releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()> { self.do_release(inode, handle) } fn mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, extensions: Extensions, ) -> io::Result { let data = self.inodes.get(parent).ok_or_else(ebadf)?; let parent_file = data.get_file()?; let invalidated_inode = self.before_invalidating_path(&data, name); let res = { let _credentials_guard = self.unix_credentials_guard(&ctx, &extensions)?; let _umask_guard = self .posix_acl .load(Ordering::Relaxed) .then(|| oslib::ScopedUmask::new(umask)); // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::mkdirat(parent_file.as_raw_fd(), name.as_ptr(), mode) } }; if let Some(invalidated_inode) = invalidated_inode { self.after_invalidating_path(invalidated_inode, "Overwrote (via mkdir)"); } if res < 0 { return Err(io::Error::last_os_error()); } // Set security context on dir. 
if let Some(secctx) = extensions.secctx { if let Err(e) = self.do_mknod_mkdir_symlink_secctx(&parent_file, name, &secctx) { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), libc::AT_REMOVEDIR); }; return Err(e); } } self.do_lookup(parent, name) } fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { self.do_unlink(parent, name, libc::AT_REMOVEDIR) } fn readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result { if size == 0 { return Ok(ReadDir::default()); } let data = self.find_handle(handle, inode)?; let buf = vec![0; size as usize]; // Since we are going to work with the kernel offset, we have to acquire the file // lock for both the `lseek64` and `getdents64` syscalls to ensure that no other // thread changes the kernel offset while we are using it. #[allow(clippy::readonly_write_lock)] let dir = data.file.get()?.write().unwrap(); ReadDir::new(&*dir, offset as libc::off64_t, buf) } fn open( &self, _ctx: Context, inode: Inode, kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { self.do_open(inode, kill_priv, flags) } fn release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option, ) -> io::Result<()> { self.do_release(inode, handle) } fn create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, kill_priv: bool, flags: u32, umask: u32, extensions: Extensions, ) -> io::Result<(Entry, Option, OpenOptions)> { let data = self.inodes.get(parent).ok_or_else(ebadf)?; let parent_file = data.get_file()?; // We need to clean the `O_APPEND` flag in case the file is mem mapped or if the flag // is later modified in the guest using `fcntl(F_SETFL)`. We do a per-write `O_APPEND` // check setting `RWF_APPEND` for non-mmapped writes, if necessary. 
let create_flags = flags & !(libc::O_APPEND as u32); let fd = self.do_create( &ctx, &parent_file, name, mode, create_flags, umask, extensions, ); let (entry, handle) = match fd { Err(last_error) => { // Ignore the error if the file exists and O_EXCL is not present in `flags` match last_error.kind() { io::ErrorKind::AlreadyExists => { if (flags as i32 & libc::O_EXCL) != 0 { return Err(last_error); } } _ => return Err(last_error), } let entry = self.do_lookup(parent, name)?; let (handle, _) = self.do_open(entry.inode, kill_priv, flags)?; let handle = handle.ok_or_else(ebadf)?; (entry, handle) } Ok(fd) => { // Safe because we just opened this fd. let file = unsafe { File::from_raw_fd(fd) }; let entry = self.do_lookup(parent, name)?; let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); let data = HandleData { inode: entry.inode, file: file.into(), migration_info: HandleMigrationInfo::new(flags as i32), }; self.handles.write().unwrap().insert(handle, Arc::new(data)); (entry, handle) } }; let mut opts = OpenOptions::empty(); match self.cfg.cache_policy { CachePolicy::Never => opts |= OpenOptions::DIRECT_IO, CachePolicy::Metadata => opts |= OpenOptions::DIRECT_IO, CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE, _ => {} }; Ok((entry, Some(handle), opts)) } fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { self.do_unlink(parent, name, 0) } fn read( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option, _flags: u32, ) -> io::Result { let data = self.find_handle(handle, inode)?; // This is safe because read_from_file_at uses preadv64, so the underlying file descriptor // offset is not affected by this operation. 
let f = data.file.get()?.read().unwrap(); w.read_from_file_at(&f, size as usize, offset) } fn write( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option, delayed_write: bool, kill_priv: bool, flags: u32, ) -> io::Result { let data = self.find_handle(handle, inode)?; // This is safe because write_to_file_at uses `pwritev2(2)`, so the underlying file // descriptor offset is not affected by this operation. let f = data.file.get()?.read().unwrap(); { let _killpriv_guard = if self.cfg.killpriv_v2 && kill_priv { // We need to drop FSETID during a write so that the kernel will remove setuid // or setgid bits from the file if it was written to by someone other than the // owner. drop_effective_cap("FSETID")? } else { None }; self.clear_file_capabilities(f.as_raw_fd(), false)?; // We don't set the `RWF_APPEND` (i.e., equivalent to `O_APPEND`) flag, if it's a // delayed write (i.e., using writeback mode or a mem mapped file) even if the file // was open in append mode, since the guest kernel sends the correct offset. // For non-delayed writes, we set the append mode, if necessary, to correctly handle // writes on a file shared among VMs. This case can only be handled correctly if the // write on the underlying file is performed in append mode. 
let is_append = flags & libc::O_APPEND as u32 != 0; let flags = (!delayed_write && is_append).then_some(oslib::WritevFlags::RWF_APPEND); r.write_to_file_at(&f, size as usize, offset, flags) } } fn getattr( &self, _ctx: Context, inode: Inode, _handle: Option, ) -> io::Result<(fuse::Attr, Duration)> { self.do_getattr(inode) } fn setattr( &self, _ctx: Context, inode: Inode, attr: fuse::SetattrIn, handle: Option, valid: SetattrValid, ) -> io::Result<(fuse::Attr, Duration)> { let inode_data = self.inodes.get(inode).ok_or_else(ebadf)?; // In this case, we need to open a new O_RDWR FD let rdwr_inode_file = handle.is_none() && valid.intersects(SetattrValid::SIZE); let inode_file = if rdwr_inode_file { inode_data.open_file(libc::O_NONBLOCK | libc::O_RDWR, &self.proc_self_fd)? } else { inode_data.get_file()? }; // `HandleData` is never read, but we need to keep a reference so its file is not dropped #[allow(dead_code)] enum Data { Handle(Arc, RawFd), ProcPath(CString), } // If we have a handle then use it otherwise get a new fd from the inode. let data = if let Some(handle) = handle { let hd = self.find_handle(handle, inode)?; let fd = hd.file.get()?.write().unwrap().as_raw_fd(); Data::Handle(hd, fd) } else { let pathname = CString::new(format!("{}", inode_file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Data::ProcPath(pathname) }; if valid.contains(SetattrValid::MODE) { // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { match data { Data::Handle(_, fd) => libc::fchmod(fd, attr.mode), Data::ProcPath(ref p) => { libc::fchmodat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), attr.mode, 0) } } }; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.intersects(SetattrValid::UID | SetattrValid::GID) { let uid = if valid.contains(SetattrValid::UID) { self.map_guest_uid(attr.uid)?.into_inner() } else { // Cannot use -1 here because these are unsigned values. 
u32::MAX }; let gid = if valid.contains(SetattrValid::GID) { self.map_guest_gid(attr.gid)?.into_inner() } else { // Cannot use -1 here because these are unsigned values. u32::MAX }; self.clear_file_capabilities(inode_file.as_raw_fd(), true)?; // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::fchownat( inode_file.as_raw_fd(), empty.as_ptr(), uid, gid, libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, ) }; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.contains(SetattrValid::SIZE) { let fd = match data { Data::Handle(_, fd) => fd, _ => { // Should have opened an O_RDWR inode_file above assert!(rdwr_inode_file); inode_file.as_raw_fd() } }; let _killpriv_guard = if self.cfg.killpriv_v2 && valid.contains(SetattrValid::KILL_SUIDGID) { drop_effective_cap("FSETID")? } else { None }; // Safe because this doesn't modify any memory and we check the return value. let res = self .clear_file_capabilities(fd, false) .map(|_| unsafe { libc::ftruncate(fd, attr.size as i64) })?; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) { let mut tvs = [ libc::timespec { tv_sec: 0, tv_nsec: libc::UTIME_OMIT, }, libc::timespec { tv_sec: 0, tv_nsec: libc::UTIME_OMIT, }, ]; if valid.contains(SetattrValid::ATIME_NOW) { tvs[0].tv_nsec = libc::UTIME_NOW; } else if valid.contains(SetattrValid::ATIME) { tvs[0].tv_sec = attr.atime as i64; tvs[0].tv_nsec = attr.atimensec.into(); } if valid.contains(SetattrValid::MTIME_NOW) { tvs[1].tv_nsec = libc::UTIME_NOW; } else if valid.contains(SetattrValid::MTIME) { tvs[1].tv_sec = attr.mtime as i64; tvs[1].tv_nsec = attr.mtimensec.into(); } // Safe because this doesn't modify any memory and we check the return value. 
let res = match data { Data::Handle(_, fd) => unsafe { libc::futimens(fd, tvs.as_ptr()) }, Data::ProcPath(ref p) => unsafe { libc::utimensat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), tvs.as_ptr(), 0) }, }; if res < 0 { return Err(io::Error::last_os_error()); } } self.do_getattr(inode) } fn rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()> { let old_inode = self.inodes.get(olddir).ok_or_else(ebadf)?; let new_inode = self.inodes.get(newdir).ok_or_else(ebadf)?; let old_file = old_inode.get_file()?; let new_file = new_inode.get_file()?; let invalidated_inode = self.before_invalidating_path(&new_inode, newname); // Safe because this doesn't modify any memory and we check the return value. // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands // and we have glibc 2.28. let res = unsafe { libc::syscall( libc::SYS_renameat2, old_file.as_raw_fd(), oldname.as_ptr(), new_file.as_raw_fd(), newname.as_ptr(), flags, ) }; if let Some(invalidated_inode) = invalidated_inode { self.after_invalidating_path(invalidated_inode, "Overwrote (via rename)"); } if res != 0 { return Err(io::Error::last_os_error()); } if let Err(err) = self.update_inode_migration_info(new_inode, newname) { warn!( "Failed to update renamed file's ({oldname:?} -> {newname:?}) migration info, \ the migration destination may be unable to find it: {err}", ); } Ok(()) } fn mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, extensions: Extensions, ) -> io::Result { let data = self.inodes.get(parent).ok_or_else(ebadf)?; let parent_file = data.get_file()?; let invalidated_inode = self.before_invalidating_path(&data, name); let res = { let _credentials_guard = self.unix_credentials_guard(&ctx, &extensions)?; let _umask_guard = self .posix_acl .load(Ordering::Relaxed) .then(|| oslib::ScopedUmask::new(umask)); // Safe because this doesn't modify any memory and we check the 
return value. unsafe { libc::mknodat( parent_file.as_raw_fd(), name.as_ptr(), mode as libc::mode_t, u64::from(rdev), ) } }; if let Some(invalidated_inode) = invalidated_inode { self.after_invalidating_path(invalidated_inode, "Overwrote (via mknod)"); } if res < 0 { return Err(io::Error::last_os_error()); } // Set security context on node. if let Some(secctx) = extensions.secctx { if let Err(e) = self.do_mknod_mkdir_symlink_secctx(&parent_file, name, &secctx) { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), 0); }; return Err(e); } } self.do_lookup(parent, name) } fn link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result { let data = self.inodes.get(inode).ok_or_else(ebadf)?; let new_inode = self.inodes.get(newparent).ok_or_else(ebadf)?; let inode_file = data.get_file()?; let newparent_file = new_inode.get_file()?; let procname = CString::new(format!("{}", inode_file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let invalidated_inode = self.before_invalidating_path(&new_inode, newname); // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::linkat( self.proc_self_fd.as_raw_fd(), procname.as_ptr(), newparent_file.as_raw_fd(), newname.as_ptr(), libc::AT_SYMLINK_FOLLOW, ) }; if let Some(invalidated_inode) = invalidated_inode { self.after_invalidating_path(invalidated_inode, "Overwrote (via link)"); } if res == 0 { self.do_lookup(newparent, newname) } else { Err(io::Error::last_os_error()) } } fn symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, extensions: Extensions, ) -> io::Result { let data = self.inodes.get(parent).ok_or_else(ebadf)?; let parent_file = data.get_file()?; let invalidated_inode = self.before_invalidating_path(&data, name); let res = { let _credentials_guard = self.unix_credentials_guard(&ctx, &extensions)?; // Safe because this doesn't modify any memory and we check the return value. 
unsafe { libc::symlinkat(linkname.as_ptr(), parent_file.as_raw_fd(), name.as_ptr()) } }; if let Some(invalidated_inode) = invalidated_inode { self.after_invalidating_path(invalidated_inode, "Overwrote (via symlink)"); } if res < 0 { return Err(io::Error::last_os_error()); } // Set security context on symlink. if let Some(secctx) = extensions.secctx { if let Err(e) = self.do_mknod_mkdir_symlink_secctx(&parent_file, name, &secctx) { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), 0); }; return Err(e); } } self.do_lookup(parent, name) } fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result> { let data = self.inodes.get(inode).ok_or_else(ebadf)?; let inode_file = data.get_file()?; let mut buf = vec![0; libc::PATH_MAX as usize]; // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; // Safe because this will only modify the contents of `buf` and we check the return value. let res = unsafe { libc::readlinkat( inode_file.as_raw_fd(), empty.as_ptr(), buf.as_mut_ptr() as *mut libc::c_char, buf.len(), ) }; if res < 0 { return Err(io::Error::last_os_error()); } buf.resize(res as usize, 0); Ok(buf) } fn flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()> { let data = self.find_handle(handle, inode)?; // Since this method is called whenever an fd is closed in the client, we can emulate that // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe // because this doesn't modify any memory and we check the return values. 
unsafe { let newfd = libc::dup(data.file.get()?.write().unwrap().as_raw_fd()); if newfd < 0 { return Err(io::Error::last_os_error()); } if libc::close(newfd) < 0 { Err(io::Error::last_os_error()) } else { Ok(()) } } } fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { let data = self.find_handle(handle, inode)?; let fd = data.file.get()?.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { if datasync { libc::fdatasync(fd) } else { libc::fsync(fd) } }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn fsyncdir( &self, ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()> { self.fsync(ctx, inode, datasync, handle) } fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> { let data = self.inodes.get(inode).ok_or_else(ebadf)?; let inode_file = data.get_file()?; let st = statx(&inode_file, None)?.st; let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK); if mode == libc::F_OK { // The file exists since we were able to call `stat(2)` on it. return Ok(()); } // We use ctx.uid/ctx.gid for these checks, but when idmapped mounts // support is enabled on the guest side, it means that "default_permissions" // flag is set on virtiofs mount and FUSE_ACCESS request should never be // sent to the userspace. Please, refer to the kernel commit // ("fs/fuse: warn if fuse_access is called when idmapped mounts are allowed"). // In case when idmapped mounts are not enabled we are good to rely on ctx.uid/ctx.gid values. 
let st_uid = self.map_host_uid(HostUid::from(st.st_uid))?; let st_gid = self.map_host_gid(HostGid::from(st.st_gid))?; if (mode & libc::R_OK) != 0 && !ctx.uid.is_root() && (st_uid != ctx.uid || st.st_mode & 0o400 == 0) && (st_gid != ctx.gid || st.st_mode & 0o040 == 0) && st.st_mode & 0o004 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } if (mode & libc::W_OK) != 0 && !ctx.uid.is_root() && (st_uid != ctx.uid || st.st_mode & 0o200 == 0) && (st_gid != ctx.gid || st.st_mode & 0o020 == 0) && st.st_mode & 0o002 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } // root can only execute something if it is executable by one of the owner, the group, or // everyone. if (mode & libc::X_OK) != 0 && (!ctx.uid.is_root() || st.st_mode & 0o111 == 0) && (st_uid != ctx.uid || st.st_mode & 0o100 == 0) && (st_gid != ctx.gid || st.st_mode & 0o010 == 0) && st.st_mode & 0o001 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } Ok(()) } fn setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, extra_flags: SetxattrFlags, ) -> io::Result<()> { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } let data = self.inodes.get(inode).ok_or_else(ebadf)?; let name = self.map_client_xattrname(name)?; // If we are setting posix access acl and if SGID needs to be // cleared. Let's do it explicitly by calling a chmod() syscall. let xattr_name = name.as_ref().to_str().unwrap(); let must_clear_sgid = self.posix_acl.load(Ordering::Relaxed) && extra_flags.contains(SetxattrFlags::SETXATTR_ACL_KILL_SGID) && xattr_name.eq("system.posix_acl_access"); let res = if is_safe_inode(data.mode) { // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. 
let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; self.clear_file_capabilities(file.as_raw_fd(), false)?; if must_clear_sgid { self.clear_sgid(&file, false)?; } // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::fsetxattr( file.as_raw_fd(), name.as_ptr(), value.as_ptr() as *const libc::c_void, value.len(), flags as libc::c_int, ) } } else { let file = data.get_file()?; self.clear_file_capabilities(file.as_raw_fd(), true)?; if must_clear_sgid { self.clear_sgid(&file, true)?; } let procname = CString::new(format!("{}", file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::setxattr( procname.as_ptr(), name.as_ptr(), value.as_ptr() as *const libc::c_void, value.len(), flags as libc::c_int, ) } }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } let mut buf = vec![0; size as usize]; let name = self.map_client_xattrname(name).map_err(|e| { if e.kind() == ErrorKind::PermissionDenied { io::Error::from_raw_os_error(libc::ENODATA) } else { e } })?; let data = self.inodes.get(inode).ok_or_else(ebadf)?; let res = if is_safe_inode(data.mode) { // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; // Safe because this will only modify the contents of `buf`. 
unsafe { libc::fgetxattr( file.as_raw_fd(), name.as_ptr(), buf.as_mut_ptr() as *mut libc::c_void, size as libc::size_t, ) } } else { let file = data.get_file()?; let procname = CString::new(format!("{}", file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); // Safe because this will only modify the contents of `buf`. unsafe { libc::getxattr( procname.as_ptr(), name.as_ptr(), buf.as_mut_ptr() as *mut libc::c_void, size as libc::size_t, ) } }; if res < 0 { return Err(io::Error::last_os_error()); } if size == 0 { Ok(GetxattrReply::Count(res as u32)) } else { buf.resize(res as usize, 0); Ok(GetxattrReply::Value(buf)) } } fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } let data = self.inodes.get(inode).ok_or_else(ebadf)?; let mut buf = vec![0; size as usize]; let res = if is_safe_inode(data.mode) { // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; // Safe because this will only modify the contents of `buf`. unsafe { libc::flistxattr( file.as_raw_fd(), buf.as_mut_ptr() as *mut libc::c_char, size as libc::size_t, ) } } else { let file = data.get_file()?; let procname = CString::new(format!("{}", file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); // Safe because this will only modify the contents of `buf`. 
unsafe { libc::listxattr( procname.as_ptr(), buf.as_mut_ptr() as *mut libc::c_char, size as libc::size_t, ) } }; if res < 0 { return Err(io::Error::last_os_error()); } if size == 0 { Ok(ListxattrReply::Count(res as u32)) } else { buf.resize(res as usize, 0); let buf = self.map_server_xattrlist(buf); Ok(ListxattrReply::Names(buf)) } } fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } let data = self.inodes.get(inode).ok_or_else(ebadf)?; let name = self.map_client_xattrname(name)?; let res = if is_safe_inode(data.mode) { // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::fremovexattr(file.as_raw_fd(), name.as_ptr()) } } else { let file = data.get_file()?; let procname = CString::new(format!("{}", file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::removexattr(procname.as_ptr(), name.as_ptr()) } }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()> { let data = self.find_handle(handle, inode)?; let fd = data.file.get()?.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. 
let res = unsafe { libc::fallocate64( fd, mode as libc::c_int, offset as libc::off64_t, length as libc::off64_t, ) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn lseek( &self, _ctx: Context, inode: Inode, handle: Handle, offset: u64, whence: u32, ) -> io::Result { let data = self.find_handle(handle, inode)?; let fd = data.file.get()?.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::lseek(fd, offset as libc::off64_t, whence as libc::c_int) }; if res < 0 { Err(io::Error::last_os_error()) } else { Ok(res as u64) } } fn copyfilerange( &self, _ctx: Context, inode_in: Inode, handle_in: Handle, offset_in: u64, inode_out: Inode, handle_out: Handle, offset_out: u64, len: u64, flags: u64, ) -> io::Result { let data_in = self.find_handle(handle_in, inode_in)?; // Take just a read lock as we're not going to alter the file descriptor offset. let fd_in = data_in.file.get()?.read().unwrap().as_raw_fd(); let data_out = self.find_handle(handle_out, inode_out)?; // Take just a read lock as we're not going to alter the file descriptor offset. let fd_out = data_out.file.get()?.read().unwrap().as_raw_fd(); // Safe because this will only modify `offset_in` and `offset_out` and we check // the return value. let res = unsafe { libc::syscall( libc::SYS_copy_file_range, fd_in, &mut (offset_in as i64) as &mut _ as *mut _, fd_out, &mut (offset_out as i64) as &mut _ as *mut _, len, flags, ) }; if res < 0 { Err(io::Error::last_os_error()) } else { Ok(res as usize) } } fn syncfs(&self, _ctx: Context, inode: Inode) -> io::Result<()> { // TODO: Branch here depending on whether virtiofsd announces submounts or not. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NOFOLLOW)?; let raw_fd = file.as_raw_fd(); debug!("syncfs: inode={}, mount_fd={}", inode, raw_fd); let ret = unsafe { libc::syncfs(raw_fd) }; if ret != 0 { // Thread-safe, because errno is stored in thread-local storage. 
Err(io::Error::last_os_error()) } else { Ok(()) } } } impl HandleDataFile { fn get(&self) -> io::Result<&'_ RwLock> { match self { HandleDataFile::File(file) => Ok(file), HandleDataFile::Invalid(err) => Err(io::Error::new( err.kind(), format!("Handle is invalid because of an error during the preceding migration, which was: {err}"), )), } } } impl From for HandleDataFile { fn from(file: File) -> Self { HandleDataFile::File(RwLock::new(file)) } } virtiofsd-1.13.0/src/passthrough/mount_fd.rs000064400000000000000000000452551046102023000172510ustar 00000000000000// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. use crate::passthrough::stat::{statx, MountId}; use crate::passthrough::util::openat; use crate::util::ResultErrorContext; use std::collections::{HashMap, HashSet}; use std::ffi::CString; use std::fs::File; use std::io::{self, Read, Seek}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::sync::{Arc, Mutex, RwLock, Weak}; pub struct MountFd { map: Weak>>>, mount_id: MountId, file: File, } /// This type maintains a map where each entry maps a mount ID to an open FD on that mount. Other /// code can request an `Arc` for any mount ID. A key gets added to the map, when the /// first `Arc` for that mount ID is requested. A key gets removed from the map, when the /// last `Arc` for that mount ID is dropped. That is, map entries are reference-counted /// and other code can keep an entry in the map by holding on to an `Arc`. /// /// We currently have one use case for `MountFds`: /// /// 1. Creating a file handle only returns a mount ID, but opening a file handle requires an open FD /// on the respective mount. So we look that up in the map. 
pub struct MountFds { map: Arc>>>, /// /proc/self/mountinfo mountinfo: Mutex, /// An optional prefix to strip from all mount points in mountinfo mountprefix: Option, /// Set of filesystems for which we have already logged file handle errors error_logged: Arc>>, } impl MountFd { /** * Create a new mount FD for the given `path` relative to `dir`. * * Its mount ID is taken from `statx()`. If `mount_fds` is given, the mount FD is entered * there; unless `mount_fds` already contains a mount FD for this mount ID, in which case that * FD is returned instead of creating a new one. * * The use case for this is migration: The migration source sends us a mapping of its mount * IDs to paths, so this function turns those paths into `MountFd` objects that can be used * for `SerializableFileHandle::to_openable()`. (Note that those mount IDs are only valid on * the migration source, not here, on the destination; that’s why the `MountFd` object’s mount * ID is taken from `statx()` instead of using the source’s ID.) */ pub fn new( mount_fds: Option<&MountFds>, dir: &D, path: &str, ) -> io::Result> { // Not documented in the man page, but mount FDs must be opened with `O_RDONLY` (not just // `O_PATH`) let file = openat(dir, path, libc::O_RDONLY).err_context(|| format!("Failed to open {path}"))?; let st = statx(&file, None).err_context(|| format!("Failed to get {path}'s mount ID"))?; if let Some(mount_fds) = mount_fds { let mut mfds_locked = mount_fds.map.write().unwrap(); // Same as in `MountFds::get()`: If there is an entry but upgrade fails, treat it as // non-existent. Overwriting it is safe because `MountFd::drop()` only removes // `MountFds` entries that have a refcount of 0. 
if let Some(mount_fd) = mfds_locked.get(&st.mnt_id).and_then(Weak::upgrade) { return Ok(mount_fd); } let mount_fd = Arc::new(MountFd { map: Arc::downgrade(&mount_fds.map), mount_id: st.mnt_id, file, }); mfds_locked.insert(st.mnt_id, Arc::downgrade(&mount_fd)); Ok(mount_fd) } else { Ok(Arc::new(MountFd { map: Weak::new(), mount_id: st.mnt_id, file, })) } } pub fn file(&self) -> &File { &self.file } /// Get the associated mount ID. pub fn mount_id(&self) -> MountId { self.mount_id } } /** * Error object (to be used as `Result`) for mount-point-related errors (hence MPR). * Includes a description (that is auto-generated from the `io::Error` at first), which can be * overridden with `MPRError::set_desc()`, or given a prefix with `MPRError::prefix()`. * * The full description can be retrieved through the `Display` trait implementation (or the * auto-derived `ToString`). * * `MPRError` objects should generally be logged at some point, because they may indicate an error * in the user's configuration or a bug in virtiofsd. However, we only want to log them once per * filesystem, and so they can be silenced (setting `silent` to true if we know that we have * already logged an error for the respective filesystem) and then should not be logged. * * Naturally, a "mount-point-related" error should be associated with some mount point, which is * reflected in `fs_mount_id` and `fs_mount_root`. Setting these values will improve the error * description, because the `Display` implementation will prepend these values to the returned * string. * * To achieve this association, `MPRError` objects should be created through * `MountFds::error_for()`, which obtains the mount root path for the given mount ID, and will thus * try to not only set `fs_mount_id`, but `fs_mount_root` also. `MountFds::error_for()` will also * take care to set `silent` as appropriate. * * (Sometimes, though, we know an error is associated with a mount point, but we do not know with * which one. 
That is why the `fs_mount_id` field is optional.) */ #[derive(Debug)] pub struct MPRError { io: io::Error, description: String, silent: bool, fs_mount_id: Option, fs_mount_root: Option, } /// Type alias for convenience pub type MPRResult = Result; impl Drop for MountFd { fn drop(&mut self) { debug!( "Dropping MountFd: mount_id={}, mount_fd={}", self.mount_id, self.file.as_raw_fd(), ); // If `self.map.upgrade()` fails, then the `MountFds` structure was dropped while there was // still an `Arc` alive. In this case, we don't need to remove it from the map, // because the map doesn't exist anymore. if let Some(map) = self.map.upgrade() { let mut map = map.write().unwrap(); // After the refcount reaches zero and before we lock the map, there's a window where // the value can be concurrently replaced by a `Weak` pointer to a new `MountFd`. // Therefore, only remove the value if the refcount in the map is zero, too. if let Some(0) = map.get(&self.mount_id).map(Weak::strong_count) { map.remove(&self.mount_id); } } } } impl> From for MPRError { /// Convert any stringifyable error object that can be converted to an `io::Error` to an /// `MPRError`. Note that `fs_mount_id` and `fs_mount_root` are not set, so this `MPRError` /// object is not associated with any mount point. /// The initial description is taken from the original error object. 
fn from(err: E) -> Self { let description = err.to_string(); MPRError { io: err.into(), description, silent: false, fs_mount_id: None, fs_mount_root: None, } } } impl MPRError { /// Override the current description #[must_use] pub fn set_desc(mut self, s: String) -> Self { self.description = s; self } /// Add a prefix to the description #[must_use] pub fn prefix(self, s: String) -> Self { let new_desc = format!("{}: {}", s, self.description); self.set_desc(new_desc) } /// To give additional information to the user (when this error is logged), add the mount ID of /// the filesystem associated with this error #[must_use] fn set_mount_id(mut self, mount_id: MountId) -> Self { self.fs_mount_id = Some(mount_id); self } /// To give additional information to the user (when this error is logged), add the mount root /// path for the filesystem associated with this error #[must_use] fn set_mount_root(mut self, mount_root: String) -> Self { self.fs_mount_root = Some(mount_root); self } /// Mark this error as silent (i.e. not to be logged) #[must_use] fn silence(mut self) -> Self { self.silent = true; self } /// Return whether this error is silent (i.e. 
should not be logged) pub fn silent(&self) -> bool { self.silent } /// Return the `io::Error` from an `MPRError` and drop the rest pub fn into_inner(self) -> io::Error { self.io } } impl std::fmt::Display for MPRError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match (self.fs_mount_id, &self.fs_mount_root) { (None, None) => write!(f, "{}", self.description), (Some(id), None) => write!(f, "Filesystem with mount ID {}: {}", id, self.description), (None, Some(root)) => write!( f, "Filesystem mounted on \"{}\": {}", root, self.description ), (Some(id), Some(root)) => write!( f, "Filesystem mounted on \"{}\" (mount ID: {}): {}", root, id, self.description ), } } } impl std::error::Error for MPRError {} impl MountFds { pub fn new(mountinfo: File, mountprefix: Option) -> Self { MountFds { map: Default::default(), mountinfo: Mutex::new(mountinfo), mountprefix, error_logged: Default::default(), } } pub fn get(&self, mount_id: MountId, reopen_fd: F) -> MPRResult> where F: FnOnce(RawFd, libc::c_int) -> io::Result, { let existing_mount_fd = self .map // The `else` branch below (where `existing_mount_fd` matches `None`) takes a write lock // to insert a new mount FD into the hash map. This doesn't deadlock, because the read // lock taken here doesn't have its lifetime extended beyond the statement, because // `Weak::upgrade` returns a new pointer and not a reference into the read lock. .read() .unwrap() .get(&mount_id) // We treat a failed upgrade just like a non-existent key, because it means that all // strong references to the `MountFd` have disappeared, so it's in the process of being // dropped, but `MountFd::drop()` just did not yet get to remove it from the map. .and_then(Weak::upgrade); let mount_fd = if let Some(mount_fd) = existing_mount_fd { mount_fd } else { // `open_by_handle_at()` needs a non-`O_PATH` fd, which we will need to open here. 
We // are going to open the filesystem's mount point, but we do not know whether that is a // special file[1], and we must not open special files with anything but `O_PATH`, so // we have to get some `O_PATH` fd first that we can stat to find out whether it is // safe to open. // [1] While mount points are commonly directories, it is entirely possible for a // filesystem's root inode to be a regular or even special file. let mount_point = self.get_mount_root(mount_id)?; // Clone `mount_point` so we can still use it in error messages let c_mount_point = CString::new(mount_point.clone()).map_err(|e| { self.error_for(mount_id, e) .prefix(format!("Failed to convert \"{mount_point}\" to a CString")) })?; let mount_point_fd = unsafe { libc::open(c_mount_point.as_ptr(), libc::O_PATH) }; if mount_point_fd < 0 { return Err(self .error_for(mount_id, io::Error::last_os_error()) .prefix(format!("Failed to open mount point \"{mount_point}\""))); } // Safe because we have just opened this FD let mount_point_path = unsafe { File::from_raw_fd(mount_point_fd) }; // Ensure that `mount_point_path` refers to an inode with the mount ID we need let stx = statx(&mount_point_path, None).map_err(|e| { self.error_for(mount_id, e) .prefix(format!("Failed to stat mount point \"{mount_point}\"")) })?; if stx.mnt_id != mount_id { return Err(self .error_for(mount_id, io::Error::from_raw_os_error(libc::EIO)) .set_desc(format!( "Mount point's ({}) mount ID ({}) does not match expected value ({})", mount_point, stx.mnt_id, mount_id ))); } // Ensure that we can safely reopen `mount_point_path` with `O_RDONLY` let file_type = stx.st.st_mode & libc::S_IFMT; if file_type != libc::S_IFREG && file_type != libc::S_IFDIR { return Err(self .error_for(mount_id, io::Error::from_raw_os_error(libc::EIO)) .set_desc(format!( "Mount point \"{mount_point}\" is not a regular file or directory" ))); } // Now that we know that this is a regular file or directory, really open it let file = reopen_fd( 
mount_point_path.as_raw_fd(), libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC, ) .map_err(|e| { self.error_for(mount_id, e).prefix(format!( "Failed to reopen mount point \"{mount_point}\" for reading" )) })?; let mut mount_fds_locked = self.map.write().unwrap(); // As above: by calling `and_then(Weak::upgrade)`, we treat a failed upgrade just like a // non-existent key. If the key exists but upgrade fails, then `HashMap::insert()` // below will update the value. `MountFd::drop()` takes care to only remove a `MountFd` // without strong references from the map, and hence will not touch the updated one. if let Some(mount_fd) = mount_fds_locked.get(&mount_id).and_then(Weak::upgrade) { // A mount FD was added concurrently while we did not hold a lock on // `mount_fds.map` -- use that entry (`file` will be dropped). mount_fd } else { debug!( "Creating MountFd: mount_id={}, mount_fd={}", mount_id, file.as_raw_fd(), ); let mount_fd = Arc::new(MountFd { map: Arc::downgrade(&self.map), mount_id, file, }); mount_fds_locked.insert(mount_id, Arc::downgrade(&mount_fd)); mount_fd } }; Ok(mount_fd) } /// Given a mount ID, return the mount root path (by reading `/proc/self/mountinfo`) pub fn get_mount_root(&self, mount_id: MountId) -> MPRResult { let mountinfo = { let mountinfo_file = &mut *self.mountinfo.lock().unwrap(); mountinfo_file.rewind().map_err(|e| { self.error_for_nolookup(mount_id, e) .prefix("Failed to access /proc/self/mountinfo".into()) })?; let mut mountinfo = String::new(); mountinfo_file.read_to_string(&mut mountinfo).map_err(|e| { self.error_for_nolookup(mount_id, e) .prefix("Failed to read /proc/self/mountinfo".into()) })?; mountinfo }; let path = mountinfo.split('\n').find_map(|line| { let mut columns = line.split(char::is_whitespace); if columns.next()?.parse::().ok()? 
!= mount_id { return None; } // Skip parent mount ID, major:minor device ID, and the root within the filesystem // (to get to the mount path) columns.nth(3) }); match path { Some(p) => { let p = String::from(p); if let Some(prefix) = self.mountprefix.as_ref() { if let Some(suffix) = p.strip_prefix(prefix).filter(|s| !s.is_empty()) { Ok(suffix.into()) } else { // The shared directory is the mount point (strip_prefix() returned "") or // mount is outside the shared directory, so it must be the mount the root // directory is on Ok("/".into()) } } else { Ok(p) } } None => Err(self .error_for_nolookup(mount_id, io::Error::from_raw_os_error(libc::EINVAL)) .set_desc(format!("Failed to find mount root for mount ID {mount_id}"))), } } /// Generate an `MPRError` object for the given `mount_id`, and silence it if we have already /// generated such an object for that `mount_id`. /// (Called `..._nolookup`, because in contrast to `MountFds::error_for()`, this method will /// not try to look up the respective mount root path, and so is safe to call when such a /// lookup would be unwise.) fn error_for_nolookup>( &self, mount_id: MountId, err: E, ) -> MPRError { let err = MPRError::from(err).set_mount_id(mount_id); if self.error_logged.read().unwrap().contains(&mount_id) { err.silence() } else { self.error_logged.write().unwrap().insert(mount_id); err } } /// Call `self.error_for_nolookup()`, and if the `MPRError` object is not silenced, try to /// obtain the mount root path for the given `mount_id` and add it to the error object. /// (Note: DO NOT call this method from `MountFds::get_mount_root()`, because that may lead to /// an infinite loop.) 
pub fn error_for>(&self, mount_id: MountId, err: E) -> MPRError { let err = self.error_for_nolookup(mount_id, err); if err.silent() { // No need to add more information err } else { // This just adds some information, so ignore errors if let Ok(mount_root) = self.get_mount_root(mount_id) { err.set_mount_root(mount_root) } else { err } } } } virtiofsd-1.13.0/src/passthrough/read_only.rs000064400000000000000000000362601046102023000174060ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. //! Implementation of a read-only variant of [`PassthroughFs`]. //! //! Implements a wrapper around [`PassthroughFs`] ([`PassthroughFsRo`]) that prohibits all //! operations that would modify anything within the shared directory. This wrapper implements the //! [`FileSystem`] and [`SerializableFileSystem`] traits, so can be used as a virtiofsd filesystem //! driver. use super::util::{einval, erofs}; use super::PassthroughFs; use crate::filesystem::{ Context, Entry, Extensions, FileSystem, FsOptions, GetxattrReply, ListxattrReply, OpenOptions, SerializableFileSystem, SetattrValid, SetxattrFlags, ZeroCopyReader, ZeroCopyWriter, }; use crate::fuse; use std::convert::TryInto; use std::ffi::CStr; use std::fs::File; use std::io; use std::sync::atomic::AtomicBool; use std::sync::Arc; use std::time::Duration; /// Wrapper around `PassthroughFs`, prohibiting modifications. /// /// Prevent any operation that would modify the underlying filesystem. pub struct PassthroughFsRo(PassthroughFs); impl PassthroughFsRo { /// Create a `PassthroughFsRo` filesystem. /// /// Internally creates a `PassthroughFs` filesystem using the `cfg` configuration, then wraps /// it in the `PassthroughFsRo` type. 
pub fn new(cfg: super::Config) -> io::Result { let inner = PassthroughFs::new(cfg)?; Ok(PassthroughFsRo(inner)) } /// Internal: Run an `open()`-like function without allowing modifications or write access. /// /// That means: /// - Prevent access modes other than `O_RDONLY` and the following flags: /// - O_EXCL: We filter out `O_CREAT`, and then, its behavior will be undefined (except for /// block devices, which don’t really work with virtio-fs anyway). In any case, on a /// read-only filesystem, `O_CREAT | O_EXCL` will always give an error. /// - O_TMPFILE: Not allowed with `O_RDONLY`. /// - O_TRUNC: Undefined behavior with `O_RDONLY`, might truncate anyway. /// - Filter out `O_CREAT`, and return `EROFS` if the path does not exist yet /// /// `open_fn` runs the underlying open function, taking the potentially modified flags as an /// argument. fn rofs_open io::Result>(flags: u32, open_fn: F) -> io::Result { let cflags: libc::c_int = flags .try_into() .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; // `O_PATH` ignores all flags but `O_CLOEXEC | O_DIRECTORY | O_NOFOLLOW`, just allow it // wholesale if cflags & libc::O_PATH != 0 { return open_fn(flags); } if cflags & libc::O_ACCMODE != libc::O_RDONLY { return Err(erofs()); } // Problem: We would like to have an allowlist, not a denylist; `O_LARGEFILE` would be in // that allowlist (and is indeed set by guests), but `libc::O_LARGEFILE == 0`. The actual // value (in C) can vary between systems (and it seems that it does indeed vary), so we // cannot use an allowlist, and have to live with a denylist only. 
// For what it’s worth, here is what the allowlist would be (for comparison against the // denylist below): // - O_ACCMODE // -- already checked to be O_RDONLY // - O_APPEND | O_DSYNC | O_SYNC // -- only affect writes, so useless here, but not harmful // - O_CREAT // -- special-case below // - O_NOATIME // -- perfectly OK; in fact, we would rather force-set it, but cannot (see // --preserve-noatime) // - O_ASYNC | O_CLOEXEC | O_DIRECT | O_DIRECTORY | O_LARGEFILE | O_NOCTTY | O_NOFOLLOW | // O_NONBLOCK // -- Don’t have anything to do with writing in particular. // Note that at least `O_TMPFILE` occupies multiple bits, so we need to check exactly. Do // it for the other flags, too, why not. if cflags & libc::O_EXCL == libc::O_EXCL { // O_EXCL is undefined without O_CREAT (which we filter out below). Then again, on a // read-only filesystem, O_CREAT | O_EXCL will always return an error, so we can just // do that here. Maybe we should check whether to return EROFS or EEXIST, depending on // the case, but then again, if someone tries to create a new file on a read-only // filesystem, we can just tell them it’s EROFS. // (And it’s also weird to use O_CREAT | O_EXCL | O_RDONLY.) return Err(erofs()); } if cflags & libc::O_TMPFILE == libc::O_TMPFILE { // O_TMPFILE | O_RDONLY should have already resulted in EINVAL in the guest, but better // safeguard it explicitly. return Err(einval()); } if cflags & libc::O_TRUNC == libc::O_TRUNC { // Undefined with O_RDONLY, so we can do what we want. We need to error out, though, // lest passing it to the host will truncate the file even with O_RDONLY. return Err(einval()); } if cflags & libc::O_CREAT == 0 { open_fn(flags) } else { // Try to open without CREAT, if that fails, return EROFS open_fn(flags & !(libc::O_CREAT as u32)).map_err(|err| { if err.kind() == io::ErrorKind::NotFound { erofs() } else { err } }) } } } /// Create function definitions that always fall through to the corresponding function on `self.0` macro_rules! 
ops_allow {
    // Matches a list of `&self` method signatures (optionally generic, optionally returning a
    // value) and emits bodies that simply delegate to the wrapped filesystem (`self.0`).
    { $(
        fn $name:ident$(<$($gen_name:ident: $gen_trait:path),*>)?(
            &self
            $(, $($par_name:ident: $par_type:ty),*)?
            $(,)?
        )$( -> $ret:ty)?;
    )* } => {
        $(
            fn $name$(<$($gen_name: $gen_trait),*>)?(
                &self
                $(, $($par_name: $par_type),*)?
            )$( -> $ret)? {
                self.0.$name($($($par_name),*)?)
            }
        )*
    }
}

/// Create function definitions that always return `Err(erofs())`
macro_rules! ops_forbid {
    // Same signature matcher as `ops_allow`, but every generated body ignores its parameters and
    // fails with EROFS (the parameters are still declared so the trait signatures match).
    { $(
        fn $name:ident$(<$($gen_name:ident: $gen_trait:path),*>)?(
            &self
            $(, $($par_name:ident: $par_type:ty),*)?
            $(,)?
        ) -> io::Result<$ret_ok:ty>;
    )* } => {
        $(
            fn $name$(<$($gen_name: $gen_trait),*>)?(
                &self
                $(, $($par_name: $par_type),*)?
            ) -> io::Result<$ret_ok> {
                Err(erofs())
            }
        )*
    }
}

impl FileSystem for PassthroughFsRo {
    // Reuse the wrapped filesystem's associated types so inodes/handles are interchangeable.
    type Inode = ::Inode;
    type Handle = ::Handle;
    type DirIter = ::DirIter;

    // Execute these functions without restrictions
    ops_allow! {
        fn init(&self, capable: FsOptions) -> io::Result;
        fn destroy(&self);
        fn lookup(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result;
        fn forget(&self, ctx: Context, inode: Self::Inode, count: u64);
        fn batch_forget(&self, ctx: Context, requests: Vec<(Self::Inode, u64)>);
        fn getattr(&self, ctx: Context, inode: Self::Inode, handle: Option, ) -> io::Result<(fuse::Attr, Duration)>;
        fn readlink(&self, ctx: Context, inode: Self::Inode) -> io::Result>;
        fn read(
            &self,
            ctx: Context,
            inode: Self::Inode,
            handle: Self::Handle,
            w: W,
            size: u32,
            offset: u64,
            lock_owner: Option,
            flags: u32,
        ) -> io::Result;
        fn flush(
            &self,
            ctx: Context,
            inode: Self::Inode,
            handle: Self::Handle,
            lock_owner: u64,
        ) -> io::Result<()>;
        fn fsync(
            &self,
            ctx: Context,
            inode: Self::Inode,
            datasync: bool,
            handle: Self::Handle,
        ) -> io::Result<()>;
        fn release(
            &self,
            ctx: Context,
            inode: Self::Inode,
            flags: u32,
            handle: Self::Handle,
            flush: bool,
            flock_release: bool,
            lock_owner: Option,
        ) -> io::Result<()>;
        fn statfs(&self, ctx: Context, inode: Self::Inode) -> io::Result;
        fn getxattr(
            &self,
            ctx: Context,
            inode: Self::Inode,
            name: &CStr,
            size: u32,
        ) -> io::Result;
        fn listxattr(
            &self,
            ctx: Context,
            inode: Self::Inode,
            size: u32,
        ) -> io::Result;
        fn readdir(
            &self,
            ctx: Context,
            inode: Self::Inode,
            handle: Self::Handle,
            size: u32,
            offset: u64,
        ) -> io::Result;
        fn fsyncdir(
            &self,
            ctx: Context,
            inode: Self::Inode,
            datasync: bool,
            handle: Self::Handle,
        ) -> io::Result<()>;
        fn releasedir(
            &self,
            ctx: Context,
            inode: Self::Inode,
            flags: u32,
            handle: Self::Handle,
        ) -> io::Result<()>;
        fn lseek(
            &self,
            ctx: Context,
            inode: Self::Inode,
            handle: Self::Handle,
            offset: u64,
            whence: u32,
        ) -> io::Result;
        fn syncfs(&self, ctx: Context, inode: Self::Inode) -> io::Result<()>;
    }

    // Refuse to run these functions, always returning EROFS.
    // Note: We assume that these functions must always fail on a read-only filesystem, so failing
    // without further checks should be safe and reasonable.  However, the Linux kernel treats
    // EROFS more like a final barrier, i.e. something that is returned only if the operation would
    // succeed on a writable filesystem.  For example, on an -o ro filesystem, `mkdir()` will not
    // return EROFS immediately, but first check whether the path already exists, and if so, return
    // EEXIST instead.  That would be complicated though (and might introduce TOCTTOU problems), so
    // unconditionally returning EROFS seems like a more viable option for us.
    // (FWIW, the FUSE kernel driver does not seem to special-case EEXIST.)
    ops_forbid! {
        fn setattr(
            &self,
            _ctx: Context,
            _inode: Self::Inode,
            _attr: fuse::SetattrIn,
            _handle: Option,
            _valid: SetattrValid,
        ) -> io::Result<(fuse::Attr, Duration)>;
        fn symlink(
            &self,
            _ctx: Context,
            _linkname: &CStr,
            _parent: Self::Inode,
            _name: &CStr,
            _extensions: Extensions,
        ) -> io::Result;
        fn mknod(
            &self,
            _ctx: Context,
            _parent: Self::Inode,
            _name: &CStr,
            _mode: u32,
            _rdev: u32,
            _umask: u32,
            _extensions: Extensions,
        ) -> io::Result;
        fn mkdir(
            &self,
            _ctx: Context,
            _parent: Self::Inode,
            _name: &CStr,
            _mode: u32,
            _umask: u32,
            _extensions: Extensions,
        ) -> io::Result;
        fn unlink(&self, _ctx: Context, _parent: Self::Inode, _name: &CStr) -> io::Result<()>;
        fn rmdir(&self, _ctx: Context, _parent: Self::Inode, _name: &CStr) -> io::Result<()>;
        fn rename(
            &self,
            _ctx: Context,
            _olddir: Self::Inode,
            _oldname: &CStr,
            _newdir: Self::Inode,
            _newname: &CStr,
            _flags: u32,
        ) -> io::Result<()>;
        fn link(
            &self,
            _ctx: Context,
            _inode: Self::Inode,
            _newparent: Self::Inode,
            _newname: &CStr,
        ) -> io::Result;
        fn write(
            &self,
            _ctx: Context,
            _inode: Self::Inode,
            _handle: Self::Handle,
            _r: R,
            _size: u32,
            _offset: u64,
            _lock_owner: Option,
            _delayed_write: bool,
            _kill_priv: bool,
            _flags: u32,
        ) -> io::Result;
        fn fallocate(
            &self,
            _ctx: Context,
            _inode: Self::Inode,
            _handle: Self::Handle,
            _mode: u32,
            _offset: u64,
            _length: u64,
        ) -> io::Result<()>;
        fn setxattr(
            &self,
            _ctx: Context,
            _inode: Self::Inode,
            _name: &CStr,
            _value: &[u8],
            _flags: u32,
            _extra_flags: SetxattrFlags,
        ) -> io::Result<()>;
        fn removexattr(&self, _ctx: Context, _inode: Self::Inode, _name: &CStr) -> io::Result<()>;
        fn copyfilerange(
            &self,
            _ctx: Context,
            _inode_in: Self::Inode,
            _handle_in: Self::Handle,
            _offset_in: u64,
            _inode_out: Self::Inode,
            _handle_out: Self::Handle,
            _offset_out: u64,
            _len: u64,
            _flags: u64,
        ) -> io::Result;
    }

    fn open(
        &self,
        ctx: Context,
        inode: Self::Inode,
        kill_priv: bool,
        flags: u32,
    ) -> io::Result<(Option, OpenOptions)> {
        // Delegate through `rofs_open()`, which sanitizes `flags` for read-only access first.
        Self::rofs_open(flags, |flags| self.0.open(ctx, inode, kill_priv, flags))
    }

    fn create(
        &self,
        ctx: Context,
        parent: Self::Inode,
        name: &CStr,
        _mode: u32,
        kill_priv: bool,
        flags: u32,
        _umask: u32,
        _extensions: Extensions,
    ) -> io::Result<(Entry, Option, OpenOptions)> {
        // We never want to create, but we should allow opening existing files
        let entry = self.lookup(ctx, parent, name).map_err(|err| {
            if err.kind() == io::ErrorKind::NotFound {
                erofs()
            } else {
                err
            }
        })?;
        let (handle, opts) = self.open(ctx, entry.inode, kill_priv, flags)?;
        Ok((entry, handle, opts))
    }

    fn opendir(
        &self,
        ctx: Context,
        inode: Self::Inode,
        flags: u32,
    ) -> io::Result<(Option, OpenOptions)> {
        // Same flag sanitization as `open()`.
        Self::rofs_open(flags, |flags| self.0.opendir(ctx, inode, flags))
    }

    fn access(&self, ctx: Context, inode: Self::Inode, mask: u32) -> io::Result<()> {
        // Write permission can never be granted; everything else is decided by the inner fs.
        if mask & libc::W_OK as u32 != 0 {
            Err(erofs())
        } else {
            self.0.access(ctx, inode, mask)
        }
    }
}

impl SerializableFileSystem for PassthroughFsRo {
    // Migration-related operations do not modify the shared directory; delegate them.
    ops_allow! {
        fn prepare_serialization(&self, cancel: Arc);
        fn serialize(&self, state_pipe: File) -> io::Result<()>;
        fn deserialize_and_apply(&self, state_pipe: File) -> io::Result<()>;
    }
}
virtiofsd-1.13.0/src/passthrough/stat/file_status.rs000064400000000000000000000026271046102023000207260ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause

#[cfg(target_env = "gnu")]
pub use libc::statx as statx_st;
#[cfg(target_env = "gnu")]
pub use libc::{STATX_BASIC_STATS, STATX_MNT_ID};

// musl provides the 'struct statx', but without stx_mnt_id.
// However, the libc crate does not provide libc::statx
// if musl is used. So we add just the required struct and
// constants to make it work.
// Definitions for non-glibc targets (e.g. musl): the fields follow the layout used by the
// statx(2) interface; only what this crate reads is documented here.
#[cfg(not(target_env = "gnu"))]
#[repr(C)]
pub struct statx_st_timestamp {
    pub tv_sec: i64,
    pub tv_nsec: u32,
    pub __statx_timestamp_pad1: [i32; 1],
}

#[cfg(not(target_env = "gnu"))]
#[repr(C)]
pub struct statx_st {
    pub stx_mask: u32,
    pub stx_blksize: u32,
    pub stx_attributes: u64,
    pub stx_nlink: u32,
    pub stx_uid: u32,
    pub stx_gid: u32,
    pub stx_mode: u16,
    __statx_pad1: [u16; 1],
    pub stx_ino: u64,
    pub stx_size: u64,
    pub stx_blocks: u64,
    pub stx_attributes_mask: u64,
    pub stx_atime: statx_st_timestamp,
    pub stx_btime: statx_st_timestamp,
    pub stx_ctime: statx_st_timestamp,
    pub stx_mtime: statx_st_timestamp,
    pub stx_rdev_major: u32,
    pub stx_rdev_minor: u32,
    pub stx_dev_major: u32,
    pub stx_dev_minor: u32,
    pub stx_mnt_id: u64,
    __statx_pad2: u64,
    __statx_pad3: [u64; 12],
}

// stx_mask flag values used below.
#[cfg(not(target_env = "gnu"))]
pub const STATX_BASIC_STATS: libc::c_uint = 0x07ff;
#[cfg(not(target_env = "gnu"))]
pub const STATX_MNT_ID: libc::c_uint = 0x1000;
virtiofsd-1.13.0/src/passthrough/stat.rs000064400000000000000000000120231046102023000163740ustar 00000000000000// Copyright 2021 Red Hat, Inc. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::ffi::CStr;
use std::io;
use std::mem::MaybeUninit;
use std::os::unix::io::AsRawFd;

mod file_status;

use crate::oslib;
use file_status::{statx_st, STATX_BASIC_STATS, STATX_MNT_ID};

// Empty path, used for AT_EMPTY_PATH-style calls on the dirfd itself.
const EMPTY_CSTR: &[u8] = b"\0";

pub type MountId = u64;

// A stat result together with the ID of the mount the inode lives on.
pub struct StatExt {
    pub st: libc::stat64,
    pub mnt_id: MountId,
}

/*
 * Fields in libc::statx are only valid if their respective flag in
 * .stx_mask is set. This trait provides functions that allow safe
 * access to the libc::statx components we are interested in.
 *
 * (The implementations of these functions need to check whether the
 * associated flag is set, and then extract the respective information
 * to return it.)
*/ trait SafeStatXAccess { fn stat64(&self) -> Option; fn mount_id(&self) -> Option; } impl SafeStatXAccess for statx_st { fn stat64(&self) -> Option { fn makedev(maj: libc::c_uint, min: libc::c_uint) -> libc::dev_t { libc::makedev(maj, min) } if self.stx_mask & STATX_BASIC_STATS != 0 { /* * Unfortunately, we cannot use an initializer to create the * stat64 object, because it may contain padding and reserved * fields (depending on the architecture), and it does not * implement the Default trait. * So we take a zeroed struct and set what we can. * (Zero in all fields is wrong, but safe.) */ let mut st = unsafe { MaybeUninit::::zeroed().assume_init() }; st.st_dev = makedev(self.stx_dev_major, self.stx_dev_minor); st.st_ino = self.stx_ino; st.st_mode = self.stx_mode as _; st.st_nlink = self.stx_nlink as _; st.st_uid = self.stx_uid; st.st_gid = self.stx_gid; st.st_rdev = makedev(self.stx_rdev_major, self.stx_rdev_minor); st.st_size = self.stx_size as _; st.st_blksize = self.stx_blksize as _; st.st_blocks = self.stx_blocks as _; st.st_atime = self.stx_atime.tv_sec; st.st_atime_nsec = self.stx_atime.tv_nsec as _; st.st_mtime = self.stx_mtime.tv_sec; st.st_mtime_nsec = self.stx_mtime.tv_nsec as _; st.st_ctime = self.stx_ctime.tv_sec; st.st_ctime_nsec = self.stx_ctime.tv_nsec as _; Some(st) } else { None } } fn mount_id(&self) -> Option { if self.stx_mask & STATX_MNT_ID != 0 { Some(self.stx_mnt_id) } else { None } } } fn get_mount_id(dir: &impl AsRawFd, path: &CStr) -> Option { let mut mount_id: libc::c_int = 0; let mut c_fh = oslib::CFileHandle::default(); oslib::name_to_handle_at(dir, path, &mut c_fh, &mut mount_id, libc::AT_EMPTY_PATH) .ok() .and(Some(mount_id as MountId)) } // Only works on Linux, and libc::SYS_statx is only defined for these // environments /// Performs a statx() syscall. libc provides libc::statx() that does /// the same, however, the system's libc may not have a statx() wrapper /// (e.g. glibc before 2.28), so linking to it may fail. 
/// libc::syscall() and libc::SYS_statx are always present, though, so /// we can safely rely on them. unsafe fn do_statx( dirfd: libc::c_int, pathname: *const libc::c_char, flags: libc::c_int, mask: libc::c_uint, statxbuf: *mut statx_st, ) -> libc::c_int { libc::syscall(libc::SYS_statx, dirfd, pathname, flags, mask, statxbuf) as libc::c_int } // Real statx() that depends on do_statx() pub fn statx(dir: &impl AsRawFd, path: Option<&CStr>) -> io::Result { let mut stx_ui = MaybeUninit::::zeroed(); // Safe because this is a constant value and a valid C string. let path = path.unwrap_or_else(|| unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }); // Safe because the kernel will only write data in `stx_ui` and we // check the return value. let res = unsafe { do_statx( dir.as_raw_fd(), path.as_ptr(), libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, STATX_BASIC_STATS | STATX_MNT_ID, stx_ui.as_mut_ptr(), ) }; if res >= 0 { // Safe because we are only going to use the SafeStatXAccess // trait methods let stx = unsafe { stx_ui.assume_init() }; // if `statx()` doesn't provide the mount id (before kernel 5.8), // let's try `name_to_handle_at()`, if everything fails just use 0 let mnt_id = stx .mount_id() .or_else(|| get_mount_id(dir, path)) .unwrap_or(0); Ok(StatExt { st: stx .stat64() .ok_or_else(|| io::Error::from_raw_os_error(libc::ENOSYS))?, mnt_id, }) } else { Err(io::Error::last_os_error()) } } virtiofsd-1.13.0/src/passthrough/util.rs000064400000000000000000000161401046102023000164020ustar 00000000000000// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. use crate::util::{other_io_error, ErrorContext, ResultErrorContext}; use std::ffi::{CStr, CString}; use std::fs::File; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::{fmt, io}; /// Safe wrapper around libc::openat(). 
pub fn openat(dir_fd: &impl AsRawFd, path: &str, flags: libc::c_int) -> io::Result<File> {
    let path_cstr =
        CString::new(path).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

    // Safe because:
    // - CString::new() has returned success and thus guarantees `path_cstr` is a valid
    //   NUL-terminated string
    // - this does not modify any memory
    // - we check the return value
    // We do not check `flags` because if the kernel cannot handle poorly specified flags then we
    // have much bigger problems.
    let fd = unsafe { libc::openat(dir_fd.as_raw_fd(), path_cstr.as_ptr(), flags) };
    if fd >= 0 {
        // Safe because we just opened this fd
        Ok(unsafe { File::from_raw_fd(fd) })
    } else {
        Err(io::Error::last_os_error())
    }
}

/// Same as `openat()`, but produces more verbose errors.
///
/// Do not use this for operations where the error is returned to the guest, as the raw OS error
/// value will be clobbered.
pub fn openat_verbose(dir_fd: &impl AsRawFd, path: &str, flags: libc::c_int) -> io::Result<File> {
    openat(dir_fd, path, flags).err_context(|| path)
}

/// Open `/proc/self/fd/{fd}` with the given flags to effectively duplicate the given `fd` with new
/// flags (e.g. to turn an `O_PATH` file descriptor into one that can be used for I/O).
pub fn reopen_fd_through_proc(
    fd: &impl AsRawFd,
    flags: libc::c_int,
    proc_self_fd: &File,
) -> io::Result<File> {
    // Clear the `O_NOFOLLOW` flag if it is set since we need to follow the `/proc/self/fd` symlink
    // to get the file.
    openat(
        proc_self_fd,
        format!("{}", fd.as_raw_fd()).as_str(),
        flags & !libc::O_NOFOLLOW,
    )
}

/// Returns true if it's safe to open this inode without O_PATH.
pub fn is_safe_inode(mode: u32) -> bool {
    // Only regular files and directories are considered safe to be opened from the file
    // server without O_PATH.
matches!(mode & libc::S_IFMT, libc::S_IFREG | libc::S_IFDIR) } pub fn ebadf() -> io::Error { io::Error::from_raw_os_error(libc::EBADF) } pub fn einval() -> io::Error { io::Error::from_raw_os_error(libc::EINVAL) } pub fn erofs() -> io::Error { io::Error::from_raw_os_error(libc::EROFS) } /** * Errors that `get_path_by_fd()` can encounter. * * This specialized error type exists so that * [`crate::passthrough::device_state::preserialization::proc_paths`] can decide which errors it * considers recoverable. */ #[derive(Debug)] pub(crate) enum FdPathError { /// `readlinkat()` failed with the contained error. ReadLink(io::Error), /// Link name is too long. TooLong, /// Link name is not a valid C string. InvalidCString(io::Error), /// Returned path (contained string) is not a plain file path. NotAFile(String), /// Returned path (contained string) is reported to be deleted, i.e. no longer valid. Deleted(String), } /// Looks up an FD's path through /proc/self/fd pub(crate) fn get_path_by_fd( fd: &impl AsRawFd, proc_self_fd: &impl AsRawFd, ) -> Result { let fname = format!("{}\0", fd.as_raw_fd()); let fname_cstr = CStr::from_bytes_with_nul(fname.as_bytes()).unwrap(); let max_len = libc::PATH_MAX as usize; // does not include final NUL byte let mut link_target = vec![0u8; max_len + 1]; // make space for NUL byte let ret = unsafe { libc::readlinkat( proc_self_fd.as_raw_fd(), fname_cstr.as_ptr(), link_target.as_mut_ptr().cast::(), max_len, ) }; if ret < 0 { return Err(FdPathError::ReadLink(io::Error::last_os_error())); } else if ret as usize == max_len { return Err(FdPathError::TooLong); } link_target.truncate(ret as usize + 1); let link_target_cstring = CString::from_vec_with_nul(link_target) .map_err(|err| FdPathError::InvalidCString(other_io_error(err)))?; let link_target_str = link_target_cstring.to_string_lossy(); let pre_slash = link_target_str.split('/').next().unwrap(); if pre_slash.contains(':') { return Err(FdPathError::NotAFile(link_target_str.into_owned())); } if let 
Some(path) = link_target_str.strip_suffix(" (deleted)") {
        return Err(FdPathError::Deleted(path.to_owned()));
    }

    Ok(link_target_cstring)
}

impl From<FdPathError> for io::Error {
    /// Flatten an `FdPathError` into a plain `io::Error`, adding context text.
    fn from(err: FdPathError) -> Self {
        match err {
            FdPathError::ReadLink(err) => err.context("readlink"),
            FdPathError::TooLong => other_io_error("Path returned from readlink is too long"),
            FdPathError::InvalidCString(err) => err.context("readlink returned invalid path"),
            FdPathError::NotAFile(path) => other_io_error(format!("Not a file ({path})")),
            FdPathError::Deleted(path) => other_io_error(format!("Inode deleted ({path})")),
        }
    }
}

impl fmt::Display for FdPathError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            FdPathError::ReadLink(err) => write!(f, "readlink: {err}"),
            FdPathError::TooLong => write!(f, "Path returned from readlink is too long"),
            FdPathError::InvalidCString(err) => write!(f, "readlink returned invalid path: {err}"),
            FdPathError::NotAFile(path) => write!(f, "Not a file ({path})"),
            FdPathError::Deleted(path) => write!(f, "Inode deleted ({path})"),
        }
    }
}

impl std::error::Error for FdPathError {}

/// Debugging helper function: Turn the given file descriptor into a string representation we can
/// show the user.  If `proc_self_fd` is given, try to obtain the actual path through the symlink
/// in /proc/self/fd; otherwise (or on error), just print the integer representation (as
/// "{fd:%i}").
pub fn printable_fd(fd: &impl AsRawFd, proc_self_fd: Option<&impl AsRawFd>) -> String {
    if let Some(Ok(path)) = proc_self_fd.map(|psf| get_path_by_fd(fd, psf)) {
        match path.into_string() {
            Ok(s) => s,
            // Path is not valid UTF-8: fall back to a lossy rendering
            Err(err) => err.into_cstring().to_string_lossy().into_owned(),
        }
    } else {
        format!("{{fd:{}}}", fd.as_raw_fd())
    }
}

/// Strip `prefix` (and any remaining leading slashes) from `path`, returning the relative
/// remainder; errors if `path` does not start with `prefix`.
pub fn relative_path<'a>(path: &'a CStr, prefix: &CStr) -> io::Result<&'a CStr> {
    let mut relative_path = path
        .to_bytes_with_nul()
        .strip_prefix(prefix.to_bytes())
        .ok_or_else(|| {
            other_io_error(format!(
                "Path {path:?} is outside the directory ({prefix:?})"
            ))
        })?;

    // Remove leading / if left
    while let Some(prefixless) = relative_path.strip_prefix(b"/") {
        relative_path = prefixless;
    }

    // Must succeed: Was a `CStr` before, converted to `&[u8]` via `to_bytes_with_nul()`, so must
    // still contain exactly one NUL byte at the end of the slice
    Ok(CStr::from_bytes_with_nul(relative_path).unwrap())
}
virtiofsd-1.13.0/src/passthrough/xattrmap.rs000064400000000000000000000677571046102023000172770ustar 00000000000000//! The `xattrmap` module is used to translate extended attribute operations
//! between the server (virtiofsd) and the client (the virtio-fs guest kernel
//! module).
//!
//! Here's a non-exhaustive list of use-cases in which it may be beneficial to
//! install an extended attribute mapping:
//!
//! * The guest VM process is executing at a privilege level where it can't
//!   actually modify the extended attribute on the host. In this case, one
//!   may choose to map those guest's extended attributes to a "user."
//!   namespace on the host.
//!
//! * An extended attribute mapping can partition a host's extended attributes
//!   from a guest's to prevent the guest from clobbering extended attributes
//!   that the host has set and depends on.
//!
//! ## Rules
//!
//! The entity that launches virtiofsd may provide an "extended attributes
//! mapping" (or "xattrmap") that defines how extended attributes should be
//! translated.
An xattrmap is really just a series
//! of rules with a specific
//! syntax. When translating an xattr, the xattrmap rules are traversed in the
//! order that the mappings were originally written. The traversal is terminated
//! on the first rule that matches the xattr.
//!
//! The xattrmap _must_ have a terminating rule.
//!
//! ### Reference
//!
//! There are two ways of expressing an xattrmap rule:
//!
//! 1. `:type:scope:key:prepend:`
//! 2. `:map:key:prepend:` - this is just syntactic sugar for expressing a common
//!    rule. It is equivalent to `:prefix:all:key:prepend`.
//!
//! An xattrmap is just a series of these rules separated by whitespace. Each rule
//! can have its own delimiter. The colon (`:`) was just used here as an arbitrary
//! example. Use a delimiter that you find readable.
//!
//! Let's dissect the xattrmap rule syntax: `:type:scope:key:prepend:`.
//!
//! | type | description |
//! | - | - |
//! | prefix | The value of `key` is prepended to xattrs originating from the client (i.e., `{get,set,remove}xattr()`). The value of `prepend` is stripped from the server's reply to `listxattr()`. |
//! | ok | If the xattr originating from the client is prefixed with `key`, or if an xattr in a server reply is prefixed with `prepend` it passes through unchanged. |
//! | bad | If the xattr originating from the client is prefixed with `key` it is denied with `EPERM`. If the xattr in a server reply is prefixed with `prepend` it is hidden from the client and not included in the reply. |
//! | unsupported | If a client tries to use a name matching 'key' it's denied using ENOTSUP; when the server passes an attribute name matching 'prepend' it's hidden. In many ways its use is very like 'ok' as either an explicit terminator or for special handling of certain patterns. |
//!
//! `ok` and `bad` can both be used as simple terminators for an xattrmap to
//! satisfy the expectation that every xattrmap has a terminator. For example,
`:ok:all:::`, will vacuously terminate all mappings. Placing a rule like //! this at the end of the xattrmap rules is a common way of providing a //! terminator. //! //! | scope | description | //! | - | - | //! | server | Match on xattrnames in the server reply that are prefixed with `prepend`. | //! | client | Match on xattrnames from the client that are prefixed with `key`. | //! | all | Matches on both server replies and client requests as described for `server` and `client` scopes. | //! //! ### Examples //! //! These have been taken almost verbatim from the original virtiofsd //! documentation in the QEMU source code. //! //! #### Example 1 //! //! ```text //! :prefix:all::user.virtiofs.: //! :bad:all::: //! ``` //! //! There are two rules in this xattrmap. The first rule prefixes and strips //! `user.virtiofs.` from client requests and server replies respectively. //! //! The second rule hides any non-prefixed extended attributes that the host //! set. //! //! #### Example 2 //! //! ```text //! /prefix/all/trusted./user.virtiofs./ //! /bad/server//trusted./ //! /bad/client/user.virtiofs.// //! /ok/all/// //! ``` //! //! The first rule prefixes client xattrnames with `trusted.` and strips //! `user.virtiofs.` from xattrnames included in the server reply. //! //! The second rule hides unprefixed `trusted.` attributes on the host. //! //! The third rule prevents the guest from manipulating the `user.virtiofs.` //! namespace directly. //! //! The final rule is the terminator and allows all remaining attributes //! through unchanged. #![deny(missing_docs)] use std::borrow::Cow; use std::convert::TryFrom; use std::ffi::{CStr, CString}; use std::fmt; use std::iter::Peekable; /// Expected error conditions with respect to parsing an XattrMap or /// attempting to match on a rule. #[derive(Debug, Eq, PartialEq)] pub enum ErrorKind { /// Scope is not one of: "all", "server", "client". InvalidScope { /// The unexpected value parsed from the input stream. 
got: String, /// A list of the expected values. expected: String, }, /// Type is not one of "prefix", "ok", "bad", or "map". InvalidType { /// The unexpected value parsed from the input stream. got: String, /// A list of the expected values. expected: String, }, /// A delimiter has been found that does not match the delimiter /// the rule started with. InvalidDelimiter, /// The rule is missing fields. IncompleteRule, /// There may only be one `map` rule and it must be the final /// rule; if this error is returned, then multiple map rules /// exist or one exists and it is not the final rule. MapRuleViolation, /// The input stream doesn't contain any rules. NoRulesProvided, /// None of the rules matched on the input. UnterminatedMapping, } impl std::error::Error for ErrorKind {} impl fmt::Display for ErrorKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{self:?}") } } /// Errors specific to XattrMap operations. #[derive(Debug, Eq, PartialEq)] pub struct Error { /// The specific error condition that was detected. pub cause: ErrorKind, /// The culpable rule, if any. pub rule: Option, } impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { Some(&self.cause) } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{self:?}") } } impl From for Error { fn from(ek: ErrorKind) -> Self { Self { cause: ek, rule: None, } } } bitflags::bitflags! 
{
    /// Which direction(s) of the mapping a rule applies to.
    struct Scope: u8 {
        const CLIENT = 0b01;
        const SERVER = 0b10;
    }
}

impl Scope {
    /// Parse a scope name (`all`, `client`, `server`) into its flag set.
    fn from_bytes<B: AsRef<[u8]>>(bytes: B) -> Result<Self, ErrorKind> {
        let bytes = bytes.as_ref();
        Ok(match bytes {
            b"all" => Scope::CLIENT | Scope::SERVER,
            b"client" => Scope::CLIENT,
            b"server" => Scope::SERVER,
            _ => {
                return Err(ErrorKind::InvalidScope {
                    got: String::from_utf8_lossy(bytes).into(),
                    expected: ["all", "client", "server"].join(", "),
                })
            }
        })
    }
}

#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum Type {
    Prefix,
    Okay,
    Bad,
    Unsupported,
    Map,
}

impl Type {
    /// Parse a rule type name into a `Type`.
    fn from_bytes<B: AsRef<[u8]>>(bytes: B) -> Result<Self, ErrorKind> {
        let bytes = bytes.as_ref();
        Ok(match bytes {
            b"prefix" => Type::Prefix,
            b"ok" => Type::Okay,
            b"bad" => Type::Bad,
            b"unsupported" => Type::Unsupported,
            b"map" => Type::Map,
            _ => {
                return Err(ErrorKind::InvalidType {
                    got: String::from_utf8_lossy(bytes).into(),
                    // Keep this list in sync with the match arms above.
                    expected: ["prefix", "ok", "bad", "unsupported", "map"].join(", "),
                })
            }
        })
    }
}

/// A single parsed xattrmap rule.
#[derive(Clone, Debug, Eq, PartialEq)]
struct Rule {
    scope: Scope,
    type_: Type,
    key: CString,
    prepend: CString,
}

impl Rule {
    /// Whether this rule applies to `xattr_name` in the given (single) `scope`.
    fn matches(&self, scope: Scope, xattr_name: &[u8]) -> bool {
        if !self.scope.contains(scope) {
            return false;
        }

        match scope {
            Scope::CLIENT => xattr_name.starts_with(self.key.to_bytes()),
            Scope::SERVER => xattr_name.starts_with(self.prepend.to_bytes()),
            _ => panic!("ambiguous scope"),
        }
    }

    /// Parse one rule from the character stream, consuming its terminating delimiter.
    fn from_tokens<I>(tokens: &mut Peekable<I>) -> Result<Self, ErrorKind>
    where
        I: Iterator<Item = char>,
    {
        // The caller has already trimmed the whitespace leading up to here,
        // so the next element should be a rule delimiter.
        let delim = tokens.next().ok_or(ErrorKind::InvalidDelimiter)?;

        // This exists instead of using take_while() because take_while() will
        // consume the delimiter (if it exists) and it won't complain if it doesn't
        // exist. That means that we wouldn't be able to check for an unterminated
        // rule error like this:
        //     :prefix:all:trusted.:user.vm.
        //                                  ^ missing ':'
        let mut next_token = || {
            let mut bytes = vec![];
            loop {
                if let Some(ch) = tokens.peek() {
                    if !ch.eq(&delim) {
                        bytes.push(*ch as u8);
                        let _ = tokens.next();
                    } else {
                        // advance past delimiter
                        let _ = tokens.next();
                        break;
                    }
                } else {
                    // Ran out of tokens without finding a terminating delimiter
                    return Err(ErrorKind::IncompleteRule);
                }
            }
            Ok(bytes)
        };

        let type_ = Type::from_bytes(next_token()?)?;
        Ok(match type_ {
            Type::Map => Rule {
                type_,
                scope: Scope::CLIENT | Scope::SERVER,
                key: CString::new(next_token()?).unwrap(),
                prepend: CString::new(next_token()?).unwrap(),
            },
            Type::Prefix | Type::Okay | Type::Bad | Type::Unsupported => {
                let scope = Scope::from_bytes(next_token()?)?;
                Rule {
                    type_,
                    scope,
                    key: CString::new(next_token()?).unwrap(),
                    prepend: CString::new(next_token()?).unwrap(),
                }
            }
        })
    }

    /// Expand the syntactic-sugar `map` rule into its equivalent explicit rules.
    fn expand_map_type(rule: Self) -> Vec<Rule> {
        assert_eq!(rule.type_, Type::Map);

        // 1st: Prefix matches/everything
        let mut rules = vec![Rule {
            type_: Type::Prefix,
            scope: Scope::CLIENT | Scope::SERVER,
            key: rule.key.clone(),
            prepend: rule.prepend.clone(),
        }];

        let last_rule_type = if !rule.key.as_bytes().is_empty() {
            // 2nd: Hide non-prefixed but matching entries on the host, and
            // stop the client accessing prefixed attributes directly
            rules.push(Rule {
                type_: Type::Bad,
                scope: Scope::CLIENT | Scope::SERVER,
                key: rule.prepend,
                prepend: rule.key,
            });
            Type::Okay
        } else {
            Type::Bad
        };

        // Last: Everything else
        rules.push(Rule {
            type_: last_rule_type,
            scope: Scope::CLIENT | Scope::SERVER,
            key: CString::new("").unwrap(),
            prepend: CString::new("").unwrap(),
        });

        rules
    }
}

/// A return value that indicates the xattr name input has passed through
/// the XattrMap where a rule was successfully matched and applied to the
/// xattrname.
#[derive(Debug, Eq, PartialEq)]
pub enum AppliedRule<'a> {
    /// Server should pass the interior value onward through to the requested operation.
    Pass(Cow<'a, CStr>),
    /// Server should return EPERM (i.e., this matched on a `bad` rule).
Deny, /// Server should return ENOTSUP (i.e., this matched on a `unsupported` rule). Unsupported, } /// A collection of well-formed xattr translation rules. #[derive(Clone, Debug, Eq, PartialEq)] pub struct XattrMap { rules: Vec, } impl XattrMap { /// Applies xattrmap rules to a single extended attribute name. /// /// This should be called *before* any other extended attribute /// operation is performed on the host file system. /// /// Client request -> this method -> {get,set,remove}xattr() -> server response /// /// See also: getxattr(2), setxattr(2), removexattr(2) pub fn map_client_xattr<'a>(&self, xattr_name: &'a CStr) -> Result, Error> { let rule = self.find_rule(Scope::CLIENT, xattr_name.to_bytes())?; Ok(match rule.type_ { Type::Okay => AppliedRule::Pass(Cow::Borrowed(xattr_name)), Type::Bad => AppliedRule::Deny, Type::Unsupported => AppliedRule::Unsupported, Type::Prefix => { let mut concat = rule.prepend.as_bytes().to_vec(); concat.extend_from_slice(xattr_name.to_bytes()); AppliedRule::Pass(Cow::Owned(CString::new(concat).unwrap())) } Type::Map => panic!("Unexpanded MAP rule was found."), }) } /// Applies xattrmap rules to a list of extended attribute names. /// /// This should be called *before* replying to the client with the list /// of extended attribute names. 
/// /// Client request -> listxattr() -> this method -> server response /// /// See also: listxattr(2) pub fn map_server_xattrlist(&self, xattr_names: Vec) -> Result, Error> { let mut filtered = Vec::with_capacity(xattr_names.len()); let unprocessed = xattr_names.split(|b| *b == 0).filter(|bs| !bs.is_empty()); for xattr_name in unprocessed { let rule = self.find_rule(Scope::SERVER, xattr_name)?; let processed = match rule.type_ { Type::Bad | Type::Unsupported => continue, // hide this from the client Type::Okay => xattr_name, Type::Prefix => &xattr_name[rule.prepend.as_bytes().len()..], // strip prefix Type::Map => panic!("Unexpanded MAP rule was found."), }; filtered.extend_from_slice(processed); filtered.push(0); } if filtered.is_empty() { filtered.push(0); } filtered.shrink_to_fit(); Ok(filtered) } fn find_rule(&self, scope: Scope, xattr_name: &[u8]) -> Result<&Rule, Error> { let rule = self .rules .iter() .find(|r| r.matches(scope, xattr_name)) .ok_or(ErrorKind::UnterminatedMapping) .map_err(|e| Error { cause: e, rule: None, })?; Ok(rule) } } impl TryFrom<&str> for XattrMap { type Error = Error; fn try_from(input: &str) -> Result { let trimmed = input.trim(); let mut unparsed = trimmed.chars().peekable(); let mut rules: Vec = vec![]; while unparsed.peek().is_some() { // Skip any whitespace between rules if let Some(ch) = unparsed.peek() { if ch.is_ascii_whitespace() { let _ = unparsed.next(); continue; } } let rule = Rule::from_tokens(&mut unparsed).map_err(|e| Error { cause: e, rule: Some(rules.len() + 1), })?; if rule.type_ == Type::Map { // There may only be one 'map' rule and it must be the final rule if unparsed.peek().is_some() { return Err(Error { rule: Some(rules.len() + 1), cause: ErrorKind::MapRuleViolation, }); } rules.append(&mut Rule::expand_map_type(rule)); } else { rules.push(rule); }; } if rules.is_empty() { return Err(ErrorKind::NoRulesProvided.into()); } Ok(Self { rules }) } } #[cfg(test)] mod tests { use super::*; #[test] fn 
test_parser_can_parse_single_rule() { let input = ":prefix:client:trusted.:user.virtiofs.:"; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![Rule { type_: Type::Prefix, scope: Scope::CLIENT, key: CString::new("trusted.").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }], }; assert_eq!(actual, expected); } #[test] fn test_parser_can_parse_multiple_valid_rules() { let input = ":prefix:all::user.virtiofs.::bad:all:::"; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![ Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; assert_eq!(actual, expected); } #[test] fn test_parser_can_parse_rules_separated_by_whitespace() { let input = r#" /prefix/all/trusted./user.virtiofs./ /bad/server//trusted./ /bad/client/user.virtiofs.// /ok/all/// "#; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![ Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("trusted.").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("trusted.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::CLIENT, key: CString::new("user.virtiofs.").unwrap(), prepend: CString::new("").unwrap(), }, Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; assert_eq!(actual, expected); } #[test] fn test_parser_emits_incomplete_rule_error() { let input = ":prefix:client:hi"; let actual = XattrMap::try_from(input).unwrap_err(); let expected = Error { rule: Some(1), cause: ErrorKind::IncompleteRule, }; assert_eq!(actual, expected); } 
#[test] fn test_parser_emits_error_when_multiple_map_rules_exist() { let input = ":map:trusted.:virtiofs.user.::map:trusted.:virtiofs.user.:"; let actual = XattrMap::try_from(input).unwrap_err(); let expected = Error { rule: Some(1), cause: ErrorKind::MapRuleViolation, }; assert_eq!(actual, expected); } #[test] fn test_parser_expands_map_rule_with_empty_key() { let input = ":map::user.virtiofs.:"; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![ Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; assert_eq!(actual, expected); } #[test] fn test_parser_expands_map_rule_with_key_and_prepend() { let input = ":map:trusted.:user.virtiofs.:"; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![ Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("trusted.").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("user.virtiofs.").unwrap(), prepend: CString::new("trusted.").unwrap(), }, Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; assert_eq!(actual, expected); } #[test] fn test_parser_emits_error_when_invalid_type_is_used() { let input = ":TOMATOPIRATE:trusted.:virtiofs.user.:"; assert!(XattrMap::try_from(input).is_err()); } #[test] fn test_parser_emits_error_when_invalid_scope_is_used() { let input = "/prefix/helloworld///"; assert!(XattrMap::try_from(input).is_err()); } #[test] fn test_parser_emits_error_when_no_rules_are_provided() { let input = " "; let actual = XattrMap::try_from(input).unwrap_err(); let expected = Error { rule: None, cause: 
ErrorKind::NoRulesProvided, }; assert_eq!(actual, expected); } #[test] fn test_parser_can_parse_rules_with_different_delimiters() { let input = ":prefix:all:trusted.:user.virtiofs.: /prefix/all/trusted./user.virtiofs./"; let expected_rule = Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("trusted.").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }; let expected = XattrMap { rules: vec![expected_rule.clone(), expected_rule], }; let actual = XattrMap::try_from(input).unwrap(); assert_eq!(actual, expected); } #[test] fn test_rule_ok_all() { let map = XattrMap { rules: vec![Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }], }; let input = CString::new("user.virtiofs.potato").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Pass(CString::new("user.virtiofs.potato").unwrap().into()); assert_eq!(actual, expected); } #[test] fn test_rule_bad_hides_xattr_names_from_client() { let input = b"security.secret\x00boring_attr".to_vec(); let map = XattrMap { rules: vec![ Rule { type_: Type::Bad, scope: Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("security.").unwrap(), }, Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; let actual = map.map_server_xattrlist(input).unwrap(); let expected = b"boring_attr\x00"; assert_eq!(actual.as_slice(), expected); } #[test] fn test_rule_unsupported_hides_xattr_names_from_client() { let input = b"security.secret\x00boring_attr".to_vec(); let map = XattrMap { rules: vec![ Rule { type_: Type::Unsupported, scope: Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("security.").unwrap(), }, Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; let actual = 
map.map_server_xattrlist(input).unwrap(); let expected = b"boring_attr\x00"; assert_eq!(actual.as_slice(), expected); } #[test] fn test_rule_bad_denies_the_client_request() { let map = XattrMap { rules: vec![Rule { type_: Type::Bad, scope: Scope::CLIENT, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }], }; let input = CString::new("virtiofs.").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Deny; assert_eq!(actual, expected); } #[test] fn test_rule_unsupported_not_support_the_client_request() { let map = XattrMap { rules: vec![Rule { type_: Type::Unsupported, scope: Scope::CLIENT, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }], }; let input = CString::new("virtiofs.").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Unsupported; assert_eq!(actual, expected); } #[test] fn test_rule_prefix_prepends_xattr_names_from_client() { let map = XattrMap { rules: vec![Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("trusted.").unwrap(), prepend: CString::new("virtiofs.user.").unwrap(), }], }; let input = CString::new("trusted.secret_thing").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Pass(Cow::Owned( CString::new("virtiofs.user.trusted.secret_thing").unwrap(), )); assert_eq!(actual, expected); } #[test] fn test_rule_prefix_strips_prefixes_from_server() { let map = XattrMap { rules: vec![Rule { type_: Type::Prefix, scope: Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("virtiofs.user.").unwrap(), }], }; let list = b"virtiofs.user.x".to_vec(); let actual = map.map_server_xattrlist(list).unwrap(); let expected = b"x\x00".to_vec(); assert_eq!(actual, expected); } #[test] fn test_rule_ok_allows_xattr_names_to_pass_through_unchanged() { let map = XattrMap { rules: vec![Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: 
CString::new("allow.").unwrap(), prepend: CString::new("allow.").unwrap(), }], }; let input = CString::new("allow.y").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Pass(Cow::Owned(CString::new("allow.y").unwrap())); assert_eq!(actual, expected); let list = b"allow.y\x00".to_vec(); let expected = list.clone(); let actual = map.map_server_xattrlist(list).unwrap(); assert_eq!(actual, expected); } } virtiofsd-1.13.0/src/read_dir.rs000064400000000000000000000117321046102023000146310ustar 00000000000000// Copyright 2020 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use crate::filesystem::{DirEntry, DirectoryIterator}; use std::ffi::CStr; use std::io; use std::mem::size_of; use std::ops::{Deref, DerefMut}; use std::os::unix::io::AsRawFd; use vm_memory::ByteValued; #[repr(C, packed)] #[derive(Default, Clone, Copy)] struct LinuxDirent64 { d_ino: libc::ino64_t, d_off: libc::off64_t, d_reclen: libc::c_ushort, d_ty: libc::c_uchar, } unsafe impl ByteValued for LinuxDirent64 {} #[derive(Default)] pub struct ReadDir

{ buf: P, current: usize, end: usize, } impl> ReadDir

{ pub fn new(dir: &D, offset: libc::off64_t, buf: P) -> io::Result { // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::lseek64(dir.as_raw_fd(), offset, libc::SEEK_SET) }; if res < 0 { return Err(io::Error::last_os_error()); } // Safe because we used lseek() to get to the correct position unsafe { Self::new_no_seek(dir, buf) } } /// Continue reading from the current position in the directory without seeking. /// /// # Safety /// Caller must ensure the current position is valid, for example, by exclusively using this /// function on a given FD, potentially repeatedly. pub unsafe fn new_no_seek(dir: &D, mut buf: P) -> io::Result { // Safe because the kernel guarantees that it will only write to `buf` and we check the // return value. let res = unsafe { libc::syscall( libc::SYS_getdents64, dir.as_raw_fd(), buf.as_mut_ptr() as *mut LinuxDirent64, buf.len() as libc::c_int, ) }; if res < 0 { return Err(io::Error::last_os_error()); } Ok(ReadDir { buf, current: 0, end: res as usize, }) } } impl

ReadDir

{ /// Returns the number of bytes from the internal buffer that have not yet been consumed. pub fn remaining(&self) -> usize { self.end.saturating_sub(self.current) } } impl> DirectoryIterator for ReadDir

{ fn next(&mut self) -> Option { let rem = &self.buf[self.current..self.end]; if rem.is_empty() { return None; } // We only use debug asserts here because these values are coming from the kernel and we // trust them implicitly. debug_assert!( rem.len() >= size_of::(), "not enough space left in `rem`" ); let (front, back) = rem.split_at(size_of::()); let dirent64 = LinuxDirent64::from_slice(front).expect("unable to get LinuxDirent64 from slice"); let namelen = dirent64.d_reclen as usize - size_of::(); debug_assert!(namelen <= back.len(), "back is smaller than `namelen`"); // The kernel will pad the name with additional nul bytes until it is 8-byte aligned so // we need to strip those off here. let name = strip_padding(&back[..namelen]); let entry = DirEntry { ino: dirent64.d_ino, offset: dirent64.d_off as u64, type_: dirent64.d_ty as u32, name, }; debug_assert!( rem.len() >= dirent64.d_reclen as usize, "rem is smaller than `d_reclen`" ); self.current += dirent64.d_reclen as usize; Some(entry) } } // Like `CStr::from_bytes_with_nul` but strips any bytes after the first '\0'-byte. Panics if `b` // doesn't contain any '\0' bytes. fn strip_padding(b: &[u8]) -> &CStr { // It would be nice if we could use memchr here but that's locked behind an unstable gate. let pos = b .iter() .position(|&c| c == 0) .expect("`b` doesn't contain any nul bytes"); // Safe because we are creating this string with the first nul-byte we found so we can // guarantee that it is nul-terminated and doesn't contain any interior nuls. 
unsafe { CStr::from_bytes_with_nul_unchecked(&b[..=pos]) } } #[cfg(test)] mod test { use super::*; #[test] fn padded_cstrings() { assert_eq!(strip_padding(b".\0\0\0\0\0\0\0").to_bytes(), b"."); assert_eq!(strip_padding(b"..\0\0\0\0\0\0").to_bytes(), b".."); assert_eq!( strip_padding(b"normal cstring\0").to_bytes(), b"normal cstring" ); assert_eq!(strip_padding(b"\0\0\0\0").to_bytes(), b""); assert_eq!( strip_padding(b"interior\0nul bytes\0\0\0").to_bytes(), b"interior" ); } #[test] #[should_panic(expected = "`b` doesn't contain any nul bytes")] fn no_nul_byte() { strip_padding(b"no nul bytes in string"); } } virtiofsd-1.13.0/src/sandbox.rs000064400000000000000000000642561046102023000145270ustar 00000000000000// Copyright 2020 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use crate::{idmap, oslib, util}; use idmap::{GidMap, IdMapSetUpPipeMessage, UidMap}; use std::ffi::CString; use std::fs::{self, File}; use std::io::{Read, Write}; use std::os::fd::OwnedFd; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::path::Path; use std::process::{self, Command}; use std::str::FromStr; use std::{error, fmt, io}; use vhost::vhost_user::Listener; #[derive(Debug)] pub enum Error { /// Failed to bind mount `/proc/self/fd` into a temporary directory. BindMountProcSelfFd(io::Error), /// Failed to bind mount shared directory. BindMountSharedDir(io::Error), /// Failed to change to the old root directory. ChdirOldRoot(io::Error), /// Failed to change to the new root directory. ChdirNewRoot(io::Error), /// Call to libc::chroot returned an error. Chroot(io::Error), /// Failed to change to the root directory after the chroot call. ChrootChdir(io::Error), /// Failed to clean the properties of the mount point. CleanMount(io::Error), /// Failed to create a temporary directory. CreateTempDir(io::Error), /// Failed to drop supplemental groups. 
DropSupplementalGroups(io::Error), /// Call to libc::fork returned an error. Fork(io::Error), /// Failed to get the number of supplemental groups. GetSupplementalGroups(io::Error), /// Error bind-mounting a directory. MountBind(io::Error), /// Failed to mount old root. MountOldRoot(io::Error), /// Error mounting proc. MountProc(io::Error), /// Failed to mount new root. MountNewRoot(io::Error), /// Error mounting target directory. MountTarget(io::Error), /// Failed to open `/proc/self/mountinfo`. OpenMountinfo(io::Error), /// Failed to open new root. OpenNewRoot(io::Error), /// Failed to open old root. OpenOldRoot(io::Error), /// Failed to open `/proc/self`. OpenProcSelf(io::Error), /// Failed to open `/proc/self/fd`. OpenProcSelfFd(io::Error), /// Error switching root directory. PivotRoot(io::Error), /// Failed to remove temporary directory. RmdirTempDir(io::Error), /// Failed to lazily unmount old root. UmountOldRoot(io::Error), /// Failed to lazily unmount temporary directory. UmountTempDir(io::Error), /// Call to libc::unshare returned an error. Unshare(io::Error), /// Failed to execute `newgidmap(1)`. WriteGidMap(String), /// Failed to write to `/proc/self/setgroups`. WriteSetGroups(io::Error), /// Failed to execute `newuidmap(1)`. 
WriteUidMap(String), /// Sandbox mode unavailable for non-privileged users SandboxModeInvalidUID, /// Setting uid_map is only allowed inside a namespace for non-privileged users SandboxModeInvalidUidMap, /// Setting gid_map is only allowed inside a namespace for non-privileged users SandboxModeInvalidGidMap, } impl error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Error::{ SandboxModeInvalidGidMap, SandboxModeInvalidUID, SandboxModeInvalidUidMap, WriteGidMap, WriteUidMap, }; match self { SandboxModeInvalidUID => { write!( f, "sandbox mode 'chroot' can only be used by \ root (Use '--sandbox namespace' instead)" ) } SandboxModeInvalidUidMap => { write!( f, "uid_map can only be used by unprivileged user where sandbox mod is namespace \ (Use '--sandbox namespace' instead)" ) } SandboxModeInvalidGidMap => { write!( f, "gid_map can only be used by unprivileged user where sandbox mod is namespace \ (Use '--sandbox namespace' instead)" ) } WriteUidMap(msg) => write!(f, "write to uid map failed: {msg}"), WriteGidMap(msg) => write!(f, "write to gid map failed: {msg}"), _ => write!(f, "{self:?}"), } } } /// Mechanism to be used for setting up the sandbox. #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum SandboxMode { /// Create the sandbox using Linux namespaces. Namespace, /// Create the sandbox using chroot. Chroot, /// Don't attempt to isolate the process inside a sandbox. None, } impl FromStr for SandboxMode { type Err = &'static str; fn from_str(s: &str) -> Result { match s.to_lowercase().as_str() { "namespace" => Ok(SandboxMode::Namespace), "chroot" => Ok(SandboxMode::Chroot), "none" => Ok(SandboxMode::None), _ => Err("Unknown sandbox mode"), } } } /// A helper for creating a sandbox for isolating the service. pub struct Sandbox { /// The directory that is going to be shared with the VM. The sandbox will be constructed on top /// of this directory. 
shared_dir: String, /// A `File` object for `/proc/self/fd` obtained from the sandboxed context. proc_self_fd: Option, /// A `File` object for `/proc/self/mountinfo` obtained from the sandboxed context. mountinfo_fd: Option, /// Mechanism to be used for setting up the sandbox. sandbox_mode: SandboxMode, /// UidMap to be used for `newuidmap(1)` command line arguments uid_map: Vec, /// GidMap to be used for `newgidmap(1)` command line arguments gid_map: Vec, } impl Sandbox { pub fn new( shared_dir: String, sandbox_mode: SandboxMode, uid_map: Vec, gid_map: Vec, ) -> io::Result { let shared_dir_rp = fs::canonicalize(shared_dir)?; let shared_dir_rp_str = shared_dir_rp .to_str() .ok_or_else(|| io::Error::from_raw_os_error(libc::EINVAL))?; Ok(Sandbox { shared_dir: shared_dir_rp_str.into(), proc_self_fd: None, mountinfo_fd: None, sandbox_mode, uid_map, gid_map, }) } // Make `self.shared_dir` our root directory, and get isolated file descriptors for // `/proc/self/fd` and '/proc/self/mountinfo`. // // This is based on virtiofsd's setup_namespaces() and setup_mounts(), and it's very similar to // the strategy used in containers. Consists on a careful sequence of mounts and bind-mounts to // ensure it's not possible to escape the sandbox through `self.shared_dir` nor the file // descriptor obtained for `/proc/self/fd`. // // It's ugly, but it's the only way until Linux implements a proper containerization API. fn setup_mounts(&mut self) -> Result<(), Error> { // Open an FD to `/proc/self` so we can later open `/proc/self/mountinfo`. // (If we opened `/proc/self/mountinfo` now, it would appear empty by the end of this // function, which is why we need to defer opening it until then.) 
let c_proc_self = CString::new("/proc/self").unwrap(); let proc_self_raw = unsafe { libc::open(c_proc_self.as_ptr(), libc::O_PATH) }; if proc_self_raw < 0 { return Err(Error::OpenProcSelf(std::io::Error::last_os_error())); } // Encapsulate the `/proc/self` FD in a `File` object so it is closed when this function // returns let proc_self = unsafe { File::from_raw_fd(proc_self_raw) }; // Ensure our mount changes don't affect the parent mount namespace. oslib::mount(None, "/", None, libc::MS_SLAVE | libc::MS_REC).map_err(Error::CleanMount)?; // Mount `/proc` in this context. oslib::mount( "proc".into(), "/proc", "proc".into(), libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID | libc::MS_RELATIME, ) .map_err(Error::MountProc)?; // Bind-mount `/proc/self/fd` onto /proc preventing access to ancestor // directories. oslib::mount("/proc/self/fd".into(), "/proc", None, libc::MS_BIND) .map_err(Error::BindMountProcSelfFd)?; // Obtain a file descriptor to /proc/self/fd/ by opening bind-mounted /proc directory. let c_proc_dir = CString::new("/proc").unwrap(); let proc_self_fd = unsafe { libc::open(c_proc_dir.as_ptr(), libc::O_PATH) }; if proc_self_fd < 0 { return Err(Error::OpenProcSelfFd(std::io::Error::last_os_error())); } // Safe because we just opened this fd. self.proc_self_fd = Some(unsafe { File::from_raw_fd(proc_self_fd) }); // Bind-mount `self.shared_dir` on itself so we can use as new root on `pivot_root` syscall. oslib::mount( self.shared_dir.as_str().into(), self.shared_dir.as_str(), None, libc::MS_BIND | libc::MS_REC, ) .map_err(Error::BindMountSharedDir)?; // Get a file descriptor to our old root so we can reference it after switching root. 
let c_root_dir = CString::new("/").unwrap(); let oldroot_fd = unsafe { libc::open( c_root_dir.as_ptr(), libc::O_DIRECTORY | libc::O_RDONLY | libc::O_CLOEXEC, ) }; if oldroot_fd < 0 { return Err(Error::OpenOldRoot(std::io::Error::last_os_error())); } // Get a file descriptor to the new root so we can reference it after switching root. let c_shared_dir = CString::new(self.shared_dir.clone()).unwrap(); let newroot_fd = unsafe { libc::open( c_shared_dir.as_ptr(), libc::O_DIRECTORY | libc::O_RDONLY | libc::O_CLOEXEC, ) }; if newroot_fd < 0 { return Err(Error::OpenNewRoot(std::io::Error::last_os_error())); } // Change to new root directory to prepare for `pivot_root` syscall. oslib::fchdir(newroot_fd).map_err(Error::ChdirNewRoot)?; // Call to `pivot_root` using `.` as both new and old root. let c_current_dir = CString::new(".").unwrap(); let ret = unsafe { libc::syscall( libc::SYS_pivot_root, c_current_dir.as_ptr(), c_current_dir.as_ptr(), ) }; if ret < 0 { return Err(Error::PivotRoot(std::io::Error::last_os_error())); } // Change to old root directory to prepare for cleaning up and unmounting it. oslib::fchdir(oldroot_fd).map_err(Error::ChdirOldRoot)?; // Clean up old root to avoid mount namespace propagation. oslib::mount(None, ".", None, libc::MS_SLAVE | libc::MS_REC).map_err(Error::CleanMount)?; // Lazily unmount old root. oslib::umount2(".", libc::MNT_DETACH).map_err(Error::UmountOldRoot)?; // Change to new root. oslib::fchdir(newroot_fd).map_err(Error::ChdirNewRoot)?; // We no longer need these file descriptors, so close them. unsafe { libc::close(newroot_fd) }; unsafe { libc::close(oldroot_fd) }; // Open `/proc/self/mountinfo` now let c_mountinfo = CString::new("mountinfo").unwrap(); let mountinfo_fd = unsafe { libc::openat(proc_self.as_raw_fd(), c_mountinfo.as_ptr(), libc::O_RDONLY) }; if mountinfo_fd < 0 { return Err(Error::OpenMountinfo(std::io::Error::last_os_error())); } // Safe because we just opened this fd. 
self.mountinfo_fd = Some(unsafe { File::from_raw_fd(mountinfo_fd) }); Ok(()) } /// Sets mappings for the given uid and gid. fn setup_id_mappings( &self, uid_map: &[UidMap], gid_map: &[GidMap], pid: i32, ) -> Result<(), Error> { let current_uid = unsafe { libc::geteuid() }; let current_gid = unsafe { libc::getegid() }; // Take uid map or set up a 1-to-1 mapping for our current euid. let default_uid_map = vec![UidMap { outside_uid: current_uid, inside_uid: current_uid, count: 1, }]; let uid_map = if uid_map.is_empty() { &default_uid_map } else { uid_map }; // Take gid map or set up a 1-to-1 mapping for our current gid. let default_gid_map = vec![GidMap { outside_gid: current_gid, inside_gid: current_gid, count: 1, }]; let gid_map = if gid_map.is_empty() { &default_gid_map } else { gid_map }; // Unprivileged user can not set any mapping without any restriction. // Therefore, newuidmap/newgidmap is used instead of writing directly // into proc/[pid]/{uid,gid}_map if a potentially privileged action is // requested (outside {u,g}id != e{u,g}id or count > 1). if uid_map.len() != 1 || uid_map[0].outside_uid != current_uid || uid_map[0].count > 1 { let mut newuidmap = Command::new("newuidmap"); newuidmap.arg(pid.to_string()); for entry in uid_map.iter() { newuidmap.arg(entry.inside_uid.to_string()); newuidmap.arg(entry.outside_uid.to_string()); newuidmap.arg(entry.count.to_string()); } let output = newuidmap.output().map_err(|_| { Error::WriteUidMap(format!( "failed to execute newuidmap: {}", io::Error::last_os_error() )) })?; if !output.status.success() { return Err(Error::WriteUidMap( String::from_utf8_lossy(&output.stderr).to_string(), )); } } else { // Unprivileged part, we can driectly write to /proc/[pid]/uid_map. 
std::fs::write( format!("/proc/{pid}/uid_map"), format!("{} {} 1", uid_map[0].inside_uid, uid_map[0].outside_uid), ) .map_err(|e| Error::WriteUidMap(e.to_string()))?; } if gid_map.len() != 1 || gid_map[0].outside_gid != current_gid || gid_map[0].count > 1 { let mut newgidmap = Command::new("newgidmap"); newgidmap.arg(pid.to_string()); for entry in gid_map.iter() { newgidmap.arg(entry.inside_gid.to_string()); newgidmap.arg(entry.outside_gid.to_string()); newgidmap.arg(entry.count.to_string()); } let output = newgidmap.output().map_err(|_| { Error::WriteGidMap(format!( "failed to execute newgidmap: {}", io::Error::last_os_error() )) })?; if !output.status.success() { return Err(Error::WriteGidMap( String::from_utf8_lossy(&output.stderr).to_string(), )); } } else { // Unprivileged part, we can driectly write to /proc/[pid]/gid_map. std::fs::write(format!("/proc/{pid}/setgroups"), b"deny") .map_err(|e| Error::WriteGidMap(e.to_string()))?; std::fs::write( format!("/proc/{pid}/gid_map"), format!("{} {} 1", gid_map[0].inside_gid, gid_map[0].outside_gid), ) .map_err(|e| Error::WriteGidMap(e.to_string()))?; } Ok(()) } pub fn enter_namespace(&mut self, listener: Listener) -> Result { let uid = unsafe { libc::geteuid() }; let flags = if uid == 0 { libc::CLONE_NEWPID | libc::CLONE_NEWNS | libc::CLONE_NEWNET } else { // If running as an unprivileged user, rely on user_namespaces(7) for isolation. libc::CLONE_NEWPID | libc::CLONE_NEWNS | libc::CLONE_NEWNET | libc::CLONE_NEWUSER }; let (mut x_reader, mut x_writer) = oslib::pipe().unwrap(); let (mut y_reader, mut y_writer) = oslib::pipe().unwrap(); let pid = util::sfork().map_err(Error::Fork)?; let mut output = [0]; // First child is only responsible to setup id mapping // from outside of the main thread's namespace. // Pipe is used for synchronization between the main thread and the first child. // That will guarantee the mapping is done before the main thread gets running. 
if pid == 0 { // First child // Dropping the other end of the pipes drop(x_writer); drop(y_reader); // This is waiting until unshare() returns x_reader.read_exact(&mut output).unwrap(); assert_eq!(output[0], IdMapSetUpPipeMessage::Request as u8); // Setup uid/gid mappings if uid != 0 { let ppid = unsafe { libc::getppid() }; if let Err(error) = self.setup_id_mappings(&self.uid_map, &self.gid_map, ppid) { // We don't really need to close the pipes here, since the OS will close the FDs // after the process exits. But let's do it explicitly to signal an error to the // other end of the pipe. drop(x_reader); drop(y_writer); error!("sandbox: couldn't setup id mappings: {}", error); process::exit(1); }; } // Signal that mapping is done y_writer .write_all(&[IdMapSetUpPipeMessage::Done as u8]) .unwrap_or_else(|_| process::exit(1)); // Terminate this child process::exit(0); } else { // This is the parent let ret = unsafe { libc::unshare(flags) }; if ret != 0 { return Err(Error::Unshare(std::io::Error::last_os_error())); } // Dropping the other end of the pipes drop(x_reader); drop(y_writer); // Signal the first child to go ahead and setup the id mappings x_writer .write_all(&[IdMapSetUpPipeMessage::Request as u8]) .unwrap(); // Receive the signal that mapping is done. If the child process exits // before setting up the mapping, closing the pipe before sending the // message, `read_exact()` will fail with `UnexpectedEof`. 
y_reader
    .read_exact(&mut output)
    .unwrap_or_else(|_| process::exit(1));
assert_eq!(output[0], IdMapSetUpPipeMessage::Done as u8);

// Reap the helper child that wrote the id-map setup message.
let mut status = 0_i32;
let _ = unsafe { libc::waitpid(pid, &mut status, 0) };

// Set the process inside the user namespace as root
let mut ret = unsafe { libc::setresuid(0, 0, 0) };
if ret != 0 {
    warn!("Couldn't set the process uid as root: {}", ret);
}
ret = unsafe { libc::setresgid(0, 0, 0) };
if ret != 0 {
    warn!("Couldn't set the process gid as root: {}", ret);
}

let child = util::sfork().map_err(Error::Fork)?;
if child == 0 {
    // Second child: set up the private mount namespace and hand the
    // listener back to the caller, which keeps serving requests.
    self.setup_mounts()?;
    Ok(listener)
} else {
    // This is the parent.
    // The child process drops the `vhost::Listener` after the first
    // `accept()`. However, since the parent just waits until the child
    // ends, keeping all the FDs open, as well as the socket's FD in a
    // listen state. This is problematic because nothing prevents a
    // miss-configured VMM to try to connect twice to the same socket
    // leaving the VMM waiting forever. So, let's close the listener
    // before waiting for the child.
    let fd = listener.as_raw_fd();
    // If `vhost::Listener` were dropped, it would close the FD *and*
    // remove the socket file, so leak it instead of dropping it.
    std::mem::forget(listener);
    // Close the FD without removing the socket file.
    // SAFETY: `fd` is open and nobody owns it
    let fd = unsafe { OwnedFd::from_raw_fd(fd) };
    drop(fd);
    util::wait_for_child(child); // This never returns.
}
        }
    }
}

/// Enter a chroot jail rooted at the shared directory (used by
/// `SandboxMode::Chroot`). FDs for `/proc/self/fd` and
/// `/proc/self/mountinfo` are opened and cached first, since `/proc`
/// becomes unreachable once we have chrooted.
pub fn enter_chroot(&mut self) -> Result<(), Error> {
    let c_proc_self_fd = CString::new("/proc/self/fd").unwrap();
    let proc_self_fd = unsafe { libc::open(c_proc_self_fd.as_ptr(), libc::O_PATH) };
    if proc_self_fd < 0 {
        return Err(Error::OpenProcSelfFd(std::io::Error::last_os_error()));
    }
    // Safe because we just opened this fd.
    self.proc_self_fd = Some(unsafe { File::from_raw_fd(proc_self_fd) });

    let c_mountinfo = CString::new("/proc/self/mountinfo").unwrap();
    let mountinfo_fd = unsafe { libc::open(c_mountinfo.as_ptr(), libc::O_RDONLY) };
    if mountinfo_fd < 0 {
        return Err(Error::OpenMountinfo(std::io::Error::last_os_error()));
    }
    // Safe because we just opened this fd.
    self.mountinfo_fd = Some(unsafe { File::from_raw_fd(mountinfo_fd) });

    let c_shared_dir = CString::new(self.shared_dir.clone()).unwrap();
    let ret = unsafe { libc::chroot(c_shared_dir.as_ptr()) };
    if ret != 0 {
        return Err(Error::Chroot(std::io::Error::last_os_error()));
    }

    // chroot(2) does not change the CWD, so move into the new root.
    let c_root_dir = CString::new("/").unwrap();
    let ret = unsafe { libc::chdir(c_root_dir.as_ptr()) };
    if ret != 0 {
        return Err(Error::ChrootChdir(std::io::Error::last_os_error()));
    }

    Ok(())
}

/// Decide whether supplemental group membership must be dropped.
/// Returns `Ok(false)` when dropping is unnecessary (not root, or inside
/// a single-mapping user namespace with setgroups denied, where
/// setgroups() would fail with a privilege error anyway).
fn must_drop_supplemental_groups(&self) -> Result {
    let uid = unsafe { libc::geteuid() };
    if uid != 0 {
        return Ok(false);
    }

    // If we are running as root and the system does not support user namespaces,
    // we must drop supplemental groups.
    if !Path::new("/proc/self/ns/user").exists() {
        return Ok(true);
    }

    let uid_mmap_data =
        fs::read_to_string("/proc/self/uid_map").map_err(Error::DropSupplementalGroups)?;
    let uid_map: Vec<_> = uid_mmap_data.split_whitespace().collect();

    let gid_map_data =
        fs::read_to_string("/proc/self/gid_map").map_err(Error::DropSupplementalGroups)?;
    let gid_map: Vec<_> = gid_map_data.split_whitespace().collect();

    let setgroups =
        fs::read_to_string("/proc/self/setgroups").map_err(Error::DropSupplementalGroups)?;

    // A single line mapping only has 3 fields, and the 'count' field should
    // be 1.
    let single_uid_mapping = uid_map.len() == 3 && uid_map[2] == "1";
    let single_gid_mapping = gid_map.len() == 3 && gid_map[2] == "1";

    Ok(setgroups.trim() != "deny" || !single_uid_mapping || !single_gid_mapping)
}

/// Drop all supplemental group memberships via setgroups(2).
/// Only calls setgroups() if the process actually has supplemental groups.
fn drop_supplemental_groups(&self) -> Result<(), Error> {
    let ngroups = unsafe { libc::getgroups(0, std::ptr::null_mut()) };
    if ngroups < 0 {
        return Err(Error::GetSupplementalGroups(std::io::Error::last_os_error()));
    } else if ngroups != 0 {
        let ret = unsafe { libc::setgroups(0, std::ptr::null()) };
        if ret != 0 {
            return Err(Error::DropSupplementalGroups(
                std::io::Error::last_os_error(),
            ));
        }
    }
    Ok(())
}

/// Set up the sandbox according to `self.sandbox_mode` and return the
/// listener to keep serving on. Validates that the requested mode is
/// compatible with the current euid and any configured uid/gid maps.
pub fn enter(&mut self, listener: Listener) -> Result {
    let uid = unsafe { libc::geteuid() };
    if uid != 0 && self.sandbox_mode == SandboxMode::Chroot {
        return Err(Error::SandboxModeInvalidUID);
    }

    // uid/gid maps only make sense for an unprivileged user in namespace mode.
    if !self.uid_map.is_empty() && (uid == 0 || self.sandbox_mode != SandboxMode::Namespace) {
        return Err(Error::SandboxModeInvalidUidMap);
    }

    if !self.gid_map.is_empty() && (uid == 0 || self.sandbox_mode != SandboxMode::Namespace) {
        return Err(Error::SandboxModeInvalidGidMap);
    }

    // We must drop supplemental groups membership if we support switching
    // between arbitrary uids/gids, unless the following conditions are met:
    // we're not running as root or we are inside a user namespace with only
    // one uid and gid mapping and '/proc/self/setgroups' is equal to
    // "deny". In both of these cases, no arbitrary uid/gid switching is
    // possible and thus there's no need to drop supplemental groups. In
    // both of these scenarios calling setgroups() is also not allowed so we
    // avoid calling it since we know it will return a privilege error.
    let must_drop_supplemental_groups = match self.must_drop_supplemental_groups() {
        Ok(must_drop) => must_drop,
        Err(error) => {
            // Fail safe: if we cannot tell, try to drop anyway.
            warn!(
                "Failed to determine whether supplemental groups must be dropped: {error}; \
                 defaulting to trying to drop supplemental groups"
            );
            true
        }
    };

    if must_drop_supplemental_groups {
        self.drop_supplemental_groups()?;
    }

    match self.sandbox_mode {
        SandboxMode::Namespace => self.enter_namespace(listener),
        SandboxMode::Chroot => self.enter_chroot().and(Ok(listener)),
        SandboxMode::None => Ok(listener),
    }
}

/// Take (and give up ownership of) the cached `/proc/self/fd` file, if any.
pub fn get_proc_self_fd(&mut self) -> Option {
    self.proc_self_fd.take()
}

/// Take (and give up ownership of) the cached `/proc/self/mountinfo` file, if any.
pub fn get_mountinfo_fd(&mut self) -> Option {
    self.mountinfo_fd.take()
}

/// Root directory to serve: "/" inside a namespace/chroot sandbox,
/// otherwise the shared directory itself.
pub fn get_root_dir(&self) -> String {
    match self.sandbox_mode {
        SandboxMode::Namespace | SandboxMode::Chroot => "/".to_string(),
        SandboxMode::None => self.shared_dir.clone(),
    }
}

/// Return the prefix to strip from /proc/self/mountinfo entries to get paths that are actually
/// accessible in our sandbox
pub fn get_mountinfo_prefix(&self) -> Option {
    match self.sandbox_mode {
        SandboxMode::Namespace | SandboxMode::None => None,
        SandboxMode::Chroot => Some(self.shared_dir.clone()),
    }
}
}
virtiofsd-1.13.0/src/seccomp.rs000064400000000171421046102023000145120ustar 00000000000000// Copyright 2020 Red Hat, Inc. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use libseccomp_sys::{
    seccomp_init, seccomp_load, seccomp_release, seccomp_rule_add, SCMP_ACT_ALLOW,
    SCMP_ACT_KILL_PROCESS, SCMP_ACT_LOG, SCMP_ACT_TRAP,
};
use std::convert::TryInto;
use std::{error, fmt};

/// Errors that can occur while building or installing the seccomp filter.
#[derive(Debug)]
pub enum Error {
    /// Error allowing a syscall
    AllowSeccompSyscall(i32),
    /// Cannot load seccomp filter
    LoadSeccompFilter,
    /// Cannot initialize seccomp context
    InitSeccompContext,
}

impl error::Error for Error {}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "virtiofsd_seccomp_error: {self:?}")
    }
}

/// Default action applied to any syscall that is not on the allowlist.
#[derive(Copy, Clone, Debug)]
pub enum SeccompAction {
    Allow,
    Kill,
    Log,
    Trap,
}

// Map each action to the corresponding libseccomp SCMP_ACT_* constant.
// NOTE(review): generic parameters appear to have been stripped from this
// dump; this was presumably `impl From<SeccompAction> for u32` — confirm
// against the upstream source.
impl From for u32 {
    fn from(action: SeccompAction) -> u32 {
        match action {
            SeccompAction::Allow => SCMP_ACT_ALLOW,
            SeccompAction::Kill => SCMP_ACT_KILL_PROCESS,
            SeccompAction::Log => SCMP_ACT_LOG,
            SeccompAction::Trap => SCMP_ACT_TRAP,
        }
    }
}

// Add an "allow" rule for `$syscall` to `$ctx`, returning early from the
// enclosing function with `Error::AllowSeccompSyscall` on failure.
macro_rules! allow_syscall {
    ($ctx:ident, $syscall:expr) => {
        let syscall_nr: i32 = $syscall.try_into().unwrap();
        let ret = unsafe { seccomp_rule_add($ctx, SCMP_ACT_ALLOW, syscall_nr, 0) };
        if ret != 0 {
            return Err(Error::AllowSeccompSyscall(syscall_nr));
        }
    };
}

/// Build and install the process-wide seccomp filter: `action` is applied
/// to any syscall not explicitly allowed below. `allow_remote_logging`
/// additionally permits `sendto` (needed by syslog).
pub fn enable_seccomp(action: SeccompAction, allow_remote_logging: bool) -> Result<(), Error> {
    let ctx = unsafe { seccomp_init(action.into()) };
    if ctx.is_null() {
        return Err(Error::InitSeccompContext);
    }

    allow_syscall!(ctx, libc::SYS_accept4);
    allow_syscall!(ctx, libc::SYS_brk);
    allow_syscall!(ctx, libc::SYS_capget);
    // For CAP_FSETID
    allow_syscall!(ctx, libc::SYS_capset);
    allow_syscall!(ctx, libc::SYS_clock_gettime);
    allow_syscall!(ctx, libc::SYS_clone);
    allow_syscall!(ctx, libc::SYS_clone3);
    allow_syscall!(ctx, libc::SYS_close);
    allow_syscall!(ctx, libc::SYS_copy_file_range);
    allow_syscall!(ctx, libc::SYS_dup);
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "s390x",
        target_arch = "powerpc64"
    ))]
    allow_syscall!(ctx, libc::SYS_epoll_create);
    allow_syscall!(ctx, libc::SYS_epoll_create1);
    allow_syscall!(ctx, libc::SYS_epoll_ctl);
    allow_syscall!(ctx, libc::SYS_epoll_pwait);
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "s390x",
        target_arch = "powerpc64"
    ))]
    allow_syscall!(ctx, libc::SYS_epoll_wait);
    allow_syscall!(ctx, libc::SYS_eventfd2);
    allow_syscall!(ctx, libc::SYS_exit);
    allow_syscall!(ctx, libc::SYS_exit_group);
    allow_syscall!(ctx, libc::SYS_fallocate);
    allow_syscall!(ctx, libc::SYS_fchdir);
    allow_syscall!(ctx, libc::SYS_fchmod);
    allow_syscall!(ctx, libc::SYS_fchmodat);
    allow_syscall!(ctx, libc::SYS_fchownat);
    allow_syscall!(ctx, libc::SYS_fcntl);
    allow_syscall!(ctx, libc::SYS_fdatasync);
    allow_syscall!(ctx, libc::SYS_fgetxattr);
    allow_syscall!(ctx, libc::SYS_flistxattr);
    allow_syscall!(ctx, libc::SYS_flock);
    allow_syscall!(ctx, libc::SYS_fremovexattr);
    allow_syscall!(ctx, libc::SYS_fsetxattr);
    #[cfg(not(target_arch = "loongarch64"))]
    allow_syscall!(ctx, libc::SYS_fstat);
    #[cfg(any(target_arch = "s390x", target_arch = "powerpc64"))]
    allow_syscall!(ctx, libc::SYS_fstatfs64);
    allow_syscall!(ctx, libc::SYS_fstatfs);
    allow_syscall!(ctx, libc::SYS_fsync);
    allow_syscall!(ctx, libc::SYS_ftruncate);
    allow_syscall!(ctx, libc::SYS_futex);
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "s390x",
        target_arch = "powerpc64"
    ))]
    allow_syscall!(ctx, libc::SYS_getdents);
    allow_syscall!(ctx, libc::SYS_getdents64);
    allow_syscall!(ctx, libc::SYS_getegid);
    allow_syscall!(ctx, libc::SYS_geteuid);
    allow_syscall!(ctx, libc::SYS_getpid);
    allow_syscall!(ctx, libc::SYS_getrandom);
    allow_syscall!(ctx, libc::SYS_gettid);
    allow_syscall!(ctx, libc::SYS_gettimeofday);
    allow_syscall!(ctx, libc::SYS_getxattr);
    allow_syscall!(ctx, libc::SYS_linkat);
    allow_syscall!(ctx, libc::SYS_listxattr);
    allow_syscall!(ctx, libc::SYS_lseek);
    allow_syscall!(ctx, libc::SYS_madvise);
    allow_syscall!(ctx, libc::SYS_membarrier);
    allow_syscall!(ctx, libc::SYS_mkdirat);
    allow_syscall!(ctx, libc::SYS_mknodat);
    allow_syscall!(ctx, libc::SYS_mmap);
    allow_syscall!(ctx, libc::SYS_mprotect);
    allow_syscall!(ctx, libc::SYS_mremap);
    allow_syscall!(ctx, libc::SYS_munmap);
    allow_syscall!(ctx, libc::SYS_name_to_handle_at);
    #[cfg(not(target_arch = "loongarch64"))]
    allow_syscall!(ctx, libc::SYS_newfstatat);
    #[cfg(target_arch = "powerpc64")]
    allow_syscall!(ctx, libc::SYS__llseek);
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "s390x",
        target_arch = "powerpc64"
    ))]
    allow_syscall!(ctx, libc::SYS_open);
    allow_syscall!(ctx, libc::SYS_openat);
    allow_syscall!(ctx, libc::SYS_openat2);
    allow_syscall!(ctx, libc::SYS_open_by_handle_at);
    allow_syscall!(ctx, libc::SYS_prctl); // TODO restrict to just PR_SET_NAME?
    allow_syscall!(ctx, libc::SYS_preadv);
    allow_syscall!(ctx, libc::SYS_pread64);
    allow_syscall!(ctx, libc::SYS_pwritev2);
    allow_syscall!(ctx, libc::SYS_pwrite64);
    allow_syscall!(ctx, libc::SYS_read);
    allow_syscall!(ctx, libc::SYS_readlinkat);
    allow_syscall!(ctx, libc::SYS_recvmsg);
    #[cfg(not(any(target_arch = "loongarch64", target_arch = "riscv64")))]
    allow_syscall!(ctx, libc::SYS_renameat);
    allow_syscall!(ctx, libc::SYS_renameat2);
    allow_syscall!(ctx, libc::SYS_removexattr);
    #[cfg(target_env = "gnu")]
    allow_syscall!(ctx, libc::SYS_rseq);
    allow_syscall!(ctx, libc::SYS_rt_sigaction);
    allow_syscall!(ctx, libc::SYS_rt_sigprocmask);
    allow_syscall!(ctx, libc::SYS_rt_sigreturn);
    allow_syscall!(ctx, libc::SYS_sched_getaffinity); // used by thread_pool
    allow_syscall!(ctx, libc::SYS_sched_yield);
    allow_syscall!(ctx, libc::SYS_sendmsg);
    allow_syscall!(ctx, libc::SYS_setgroups);
    allow_syscall!(ctx, libc::SYS_setresgid);
    allow_syscall!(ctx, libc::SYS_setresuid);
    //allow_syscall!(ctx, libc::SYS_setresgid32); Needed on some platforms,
    //allow_syscall!(ctx, libc::SYS_setresuid32); Needed on some platforms
    allow_syscall!(ctx, libc::SYS_set_robust_list);
    allow_syscall!(ctx, libc::SYS_setxattr);
    allow_syscall!(ctx, libc::SYS_sigaltstack);
    #[cfg(target_arch = "s390x")]
    allow_syscall!(ctx, libc::SYS_sigreturn);
    allow_syscall!(ctx, libc::SYS_statx);
    allow_syscall!(ctx, libc::SYS_symlinkat);
    allow_syscall!(ctx, libc::SYS_syncfs);
    #[cfg(target_arch = "x86_64")]
    allow_syscall!(ctx, libc::SYS_time); // Rarely needed, except on static builds
    allow_syscall!(ctx, libc::SYS_tgkill);
    allow_syscall!(ctx, libc::SYS_umask);
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "s390x",
        target_arch = "powerpc64"
    ))]
    allow_syscall!(ctx, libc::SYS_unlink);
    allow_syscall!(ctx, libc::SYS_unlinkat);
    allow_syscall!(ctx, libc::SYS_unshare);
    allow_syscall!(ctx, libc::SYS_utimensat);
    allow_syscall!(ctx, libc::SYS_write);
    allow_syscall!(ctx, libc::SYS_writev);

    if allow_remote_logging {
        allow_syscall!(ctx, libc::SYS_sendto); // Required by syslog
    }

    // Install the filter into the kernel, then release the userspace context.
    let ret = unsafe { seccomp_load(ctx) };
    if ret != 0 {
        return Err(Error::LoadSeccompFilter);
    }

    unsafe { seccomp_release(ctx) };

    Ok(())
}
virtiofsd-1.13.0/src/server.rs000064400000001642621046102023000143730ustar 00000000000000// Copyright 2019 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use crate::descriptor_utils::{Reader, Writer};
use crate::filesystem::{
    Context, DirEntry, DirectoryIterator, Entry, Extensions, FileSystem, GetxattrReply,
    ListxattrReply, SecContext, SerializableFileSystem, ZeroCopyReader, ZeroCopyWriter,
};
use crate::fuse::*;
use crate::passthrough::util::einval;
use crate::soft_idmap::GuestGid;
use crate::{oslib, Error, Result};
use std::convert::{TryFrom, TryInto};
use std::ffi::{CStr, CString};
use std::fs::File;
use std::io::{self, Read, Write};
use std::mem::size_of;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use vm_memory::ByteValued;

// Space reserved for the FUSE header in each request buffer.
const FUSE_BUFFER_HEADER_SIZE: u32 = 0x1000;
// Upper bound on the payload of a single FUSE request/reply.
const MAX_BUFFER_SIZE: u32 = 1 << 20;
const DIRENT_PADDING: [u8; 8] = [0; 8];

const CURRENT_DIR_CSTR: &[u8] = b".";
const PARENT_DIR_CSTR: &[u8] = b"..";

// Adapter making a descriptor-chain `Reader` usable as a `ZeroCopyReader`.
struct ZcReader<'a>(Reader<'a>);

impl ZeroCopyReader for ZcReader<'_> {
    // Drain up to `count` bytes from the guest buffer into `f` at offset
    // `off`, looping until the request is satisfied or the source is empty.
    // Returns the total number of bytes written.
    fn write_to_file_at(
        &mut self,
        f: &File,
        mut count: usize,
        mut off: u64,
        flags: Option,
    ) -> io::Result {
        let start = off;
        while count > 0 {
            let written = self.0.write_to_file_at(f, count, off, flags)?;
            if written == 0 {
                break;
            }
            off = off
                .checked_add(written as u64)
                .ok_or_else(|| io::Error::other("Write wrap-around"))?;
            count = count
                .checked_sub(written)
                .ok_or_else(|| io::Error::other("Write operation wrote more than requested"))?;
        }
        // Must fit: Cannot be greater than `count` originally
        Ok((off - start) as usize)
    }
}

// Adapter making a descriptor-chain `Writer` usable as a `ZeroCopyWriter`.
struct ZcWriter<'a>(Writer<'a>);

impl ZeroCopyWriter for ZcWriter<'_> {
    // Fill the guest buffer with up to `count` bytes read from `f` at
    // offset `off`, looping until satisfied or EOF. Returns bytes read.
    fn read_from_file_at(&mut self, f: &File, mut count: usize, mut off: u64) -> io::Result {
        let start = off;
        while count > 0 {
            let read = self.0.read_from_file_at(f, count, off)?;
            if read == 0 {
                break;
            }
            off = off
                .checked_add(read as u64)
                .ok_or_else(|| io::Error::other("Read wrap-around"))?;
            count = count
                .checked_sub(read)
                .ok_or_else(|| io::Error::other("Read operation wrote more than requested"))?;
        }
        // Must fit: Cannot be greater than `count` originally
        Ok((off - start) as usize)
    }
}

/// FUSE protocol server: decodes requests from the transport, dispatches
/// them to the wrapped `FileSystem` implementation, and encodes replies.
pub struct Server {
    fs: F,
    // Negotiated `FsOptions` bits, set during INIT and read (relaxed) by
    // handlers that need to know which protocol extensions are active.
    options: AtomicU64,
}

impl Server {
    pub fn new(fs: F) -> Server {
        Server {
            fs,
            options: AtomicU64::new(FsOptions::empty().bits()),
        }
    }

    /// Decode one request from `r`, dispatch by opcode, and write the reply
    /// to `w`. Returns the number of bytes written as the reply.
    #[allow(clippy::cognitive_complexity)]
    pub fn handle_message(
        &self,
        mut r: Reader,
        w: Writer,
        vu_req: Option<&mut T>,
    ) -> Result {
        let in_header: InHeader = r.read_obj().map_err(Error::DecodeMessage)?;

        // Reject requests larger than the negotiated maximum outright.
        if in_header.len > (MAX_BUFFER_SIZE + FUSE_BUFFER_HEADER_SIZE) {
            return reply_error(
                io::Error::from_raw_os_error(libc::ENOMEM),
                in_header.unique,
                w,
            );
        }

        if let Ok(opcode) = Opcode::try_from(in_header.opcode) {
            debug!(
                "Received request: opcode={:?} ({}), inode={}, unique={}, pid={}",
                opcode, in_header.opcode, in_header.nodeid, in_header.unique, in_header.pid
            );
            match opcode {
                Opcode::Lookup => self.lookup(in_header, r, w),
                Opcode::Forget => self.forget(in_header, r), // No reply.
                Opcode::Getattr => self.getattr(in_header, r, w),
                Opcode::Setattr => self.setattr(in_header, r, w),
                Opcode::Readlink => self.readlink(in_header, w),
                Opcode::Symlink => self.symlink(in_header, r, w),
                Opcode::Mknod => self.mknod(in_header, r, w),
                Opcode::Mkdir => self.mkdir(in_header, r, w),
                Opcode::Unlink => self.unlink(in_header, r, w),
                Opcode::Rmdir => self.rmdir(in_header, r, w),
                Opcode::Rename => self.rename(in_header, r, w),
                Opcode::Link => self.link(in_header, r, w),
                Opcode::Open => self.open(in_header, r, w),
                Opcode::Read => self.read(in_header, r, w),
                Opcode::Write => self.write(in_header, r, w),
                Opcode::Statfs => self.statfs(in_header, w),
                Opcode::Release => self.release(in_header, r, w),
                Opcode::Fsync => self.fsync(in_header, r, w),
                Opcode::Setxattr => self.setxattr(in_header, r, w),
                Opcode::Getxattr => self.getxattr(in_header, r, w),
                Opcode::Listxattr => self.listxattr(in_header, r, w),
                Opcode::Removexattr => self.removexattr(in_header, r, w),
                Opcode::Flush => self.flush(in_header, r, w),
                Opcode::Init => self.init(in_header, r, w),
                Opcode::Opendir => self.opendir(in_header, r, w),
                Opcode::Readdir => self.readdir(in_header, r, w),
                Opcode::Releasedir => self.releasedir(in_header, r, w),
                Opcode::Fsyncdir => self.fsyncdir(in_header, r, w),
                Opcode::Getlk => self.getlk(in_header, r, w),
                Opcode::Setlk => self.setlk(in_header, r, w),
                Opcode::Setlkw => self.setlkw(in_header, r, w),
                Opcode::Access => self.access(in_header, r, w),
                Opcode::Create => self.create(in_header, r, w),
                Opcode::Interrupt => Ok(self.interrupt(in_header)),
                Opcode::Bmap => self.bmap(in_header, r, w),
                Opcode::Destroy => Ok(self.destroy()),
                Opcode::Ioctl => self.ioctl(in_header, r, w),
                Opcode::Poll => self.poll(in_header, r, w),
                Opcode::NotifyReply => self.notify_reply(in_header, r, w),
                Opcode::BatchForget => self.batch_forget(in_header, r, w),
                Opcode::Fallocate => self.fallocate(in_header, r, w),
                Opcode::Readdirplus => self.readdirplus(in_header, r, w),
                Opcode::Rename2 => self.rename2(in_header, r, w),
                Opcode::Lseek => self.lseek(in_header, r, w),
                Opcode::CopyFileRange => self.copyfilerange(in_header, r, w),
                Opcode::SetupMapping => self.setupmapping(in_header, r, w, vu_req),
                Opcode::RemoveMapping => self.removemapping(in_header, r, w, vu_req),
                Opcode::Syncfs => self.syncfs(in_header, w),
                Opcode::TmpFile => self.tmpfile(in_header, r, w),
            }
        } else {
            // Unknown opcode: report ENOSYS so the guest stops sending it.
            debug!(
                "Received unknown request: opcode={}, inode={}",
                in_header.opcode, in_header.nodeid
            );
            reply_error(
                io::Error::from_raw_os_error(libc::ENOSYS),
                in_header.unique,
                w,
            )
        }
    }

    // DAX mapping setup is not supported here; always answers ENOSYS.
    fn setupmapping(
        &self,
        in_header: InHeader,
        _r: Reader,
        w: Writer,
        _vu_req: Option<&mut T>,
    ) -> Result {
        reply_error(
            io::Error::from_raw_os_error(libc::ENOSYS),
            in_header.unique,
            w,
        )
    }

    // DAX mapping removal is not supported here; always answers ENOSYS.
    fn removemapping(
        &self,
        in_header: InHeader,
        _r: Reader,
        w: Writer,
        _vu_req: Option<&mut T>,
    ) -> Result {
        reply_error(
            io::Error::from_raw_os_error(libc::ENOSYS),
            in_header.unique,
            w,
        )
    }

    /// FUSE_LOOKUP: resolve `name` under the parent inode and reply with an
    /// `EntryOut`.
    fn lookup(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let namelen = (in_header.len as usize)
            .checked_sub(size_of::())
            .ok_or(Error::InvalidHeaderLength)?;

        let mut buf = vec![0u8; namelen];

        r.read_exact(&mut buf).map_err(Error::DecodeMessage)?;

        let name = bytes_to_cstr(buf.as_ref())?;

        match self
            .fs
            .lookup(Context::from(in_header), in_header.nodeid.into(), name)
        {
            Ok(entry) => {
                let out = EntryOut::from(entry);

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_FORGET: drop `nlookup` references to an inode. No reply is sent.
    fn forget(&self, in_header: InHeader, mut r: Reader) -> Result {
        let ForgetIn { nlookup } = r.read_obj().map_err(Error::DecodeMessage)?;

        self.fs
            .forget(Context::from(in_header), in_header.nodeid.into(), nlookup);

        // There is no reply for forget messages.
        Ok(0)
    }

    /// FUSE_GETATTR: return attributes (optionally via an open file handle).
    fn getattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let GetattrIn { flags, fh, .. } = r.read_obj().map_err(Error::DecodeMessage)?;

        let handle = if (flags & GETATTR_FH) != 0 {
            Some(fh.into())
        } else {
            None
        };

        match self
            .fs
            .getattr(Context::from(in_header), in_header.nodeid.into(), handle)
        {
            Ok((attr, timeout)) => {
                let out = AttrOut {
                    attr_valid: timeout.as_secs(),
                    attr_valid_nsec: timeout.subsec_nanos(),
                    dummy: 0,
                    attr,
                };
                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_SETATTR: update attributes selected by the `valid` bitmask.
    fn setattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let setattr_in: SetattrIn = r.read_obj().map_err(Error::DecodeMessage)?;

        let handle = if setattr_in.valid & FATTR_FH != 0 {
            Some(setattr_in.fh.into())
        } else {
            None
        };

        let valid = SetattrValid::from_bits_truncate(setattr_in.valid);

        match self.fs.setattr(
            Context::from(in_header),
            in_header.nodeid.into(),
            setattr_in,
            handle,
            valid,
        ) {
            Ok((attr, timeout)) => {
                let out = AttrOut {
                    attr_valid: timeout.as_secs(),
                    attr_valid_nsec: timeout.subsec_nanos(),
                    dummy: 0,
                    attr,
                };
                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_READLINK: reply with the target of a symbolic link.
    fn readlink(&self, in_header: InHeader, w: Writer) -> Result {
        match self
            .fs
            .readlink(Context::from(in_header), in_header.nodeid.into())
        {
            Ok(linkname) => {
                // We need to disambiguate the option type here even though it is `None`.
                reply_ok(None::, Some(&linkname), in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_SYMLINK: create a symlink `name` -> `linkname` under the parent.
    fn symlink(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        // Unfortunately the name and linkname are encoded one after another and
        // separated by a nul character.
        let len = (in_header.len as usize)
            .checked_sub(size_of::())
            .ok_or(Error::InvalidHeaderLength)?;
        let mut buf = vec![0; len];

        r.read_exact(&mut buf).map_err(Error::DecodeMessage)?;

        let mut components = buf.split_inclusive(|c| *c == b'\0');

        let name = components.next().ok_or(Error::MissingParameter)?;
        let linkname = components.next().ok_or(Error::MissingParameter)?;

        let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed));
        // Any trailing bytes carry request extensions (e.g. security context).
        let extensions = get_extensions(options, name.len() + linkname.len(), buf.as_slice())?;

        match self.fs.symlink(
            Context::from(in_header),
            bytes_to_cstr(linkname)?,
            in_header.nodeid.into(),
            bytes_to_cstr(name)?,
            extensions,
        ) {
            Ok(entry) => {
                let out = EntryOut::from(entry);

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_MKNOD: create a device/special file under the parent inode.
    fn mknod(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let MknodIn {
            mode, rdev, umask, ..
        } = r.read_obj().map_err(Error::DecodeMessage)?;

        let remaining_len = (in_header.len as usize)
            .checked_sub(size_of::())
            .and_then(|l| l.checked_sub(size_of::()))
            .ok_or(Error::InvalidHeaderLength)?;
        let mut buf = vec![0; remaining_len];

        r.read_exact(&mut buf).map_err(Error::DecodeMessage)?;

        let mut components = buf.split_inclusive(|c| *c == b'\0');
        let name = components.next().ok_or(Error::MissingParameter)?;

        let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed));
        let extensions = get_extensions(options, name.len(), buf.as_slice())?;

        match self.fs.mknod(
            Context::from(in_header),
            in_header.nodeid.into(),
            bytes_to_cstr(name)?,
            mode,
            rdev,
            umask,
            extensions,
        ) {
            Ok(entry) => {
                let out = EntryOut::from(entry);

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_MKDIR: create a directory under the parent inode.
    fn mkdir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let MkdirIn { mode, umask } = r.read_obj().map_err(Error::DecodeMessage)?;

        let remaining_len = (in_header.len as usize)
            .checked_sub(size_of::())
            .and_then(|l| l.checked_sub(size_of::()))
            .ok_or(Error::InvalidHeaderLength)?;
        let mut buf = vec![0; remaining_len];

        r.read_exact(&mut buf).map_err(Error::DecodeMessage)?;

        let mut components = buf.split_inclusive(|c| *c == b'\0');
        let name = components.next().ok_or(Error::MissingParameter)?;

        let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed));
        let extensions = get_extensions(options, name.len(), buf.as_slice())?;

        match self.fs.mkdir(
            Context::from(in_header),
            in_header.nodeid.into(),
            bytes_to_cstr(name)?,
            mode,
            umask,
            extensions,
        ) {
            Ok(entry) => {
                let out = EntryOut::from(entry);

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_UNLINK: remove a non-directory entry from the parent inode.
    fn unlink(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let namelen = (in_header.len as usize)
            .checked_sub(size_of::())
            .ok_or(Error::InvalidHeaderLength)?;
        let mut name = vec![0; namelen];

        r.read_exact(&mut name).map_err(Error::DecodeMessage)?;

        match self.fs.unlink(
            Context::from(in_header),
            in_header.nodeid.into(),
            bytes_to_cstr(&name)?,
        ) {
            Ok(()) => reply_ok(None::, None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_RMDIR: remove a directory entry from the parent inode.
    fn rmdir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let namelen = (in_header.len as usize)
            .checked_sub(size_of::())
            .ok_or(Error::InvalidHeaderLength)?;
        let mut name = vec![0; namelen];

        r.read_exact(&mut name).map_err(Error::DecodeMessage)?;

        match self.fs.rmdir(
            Context::from(in_header),
            in_header.nodeid.into(),
            bytes_to_cstr(&name)?,
        ) {
            Ok(()) => reply_ok(None::, None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    // Shared implementation of FUSE_RENAME and FUSE_RENAME2: parse the
    // nul-separated oldname/newname pair and forward to the filesystem.
    fn do_rename(
        &self,
        in_header: InHeader,
        msg_size: usize,
        newdir: u64,
        flags: u32,
        mut r: Reader,
        w: Writer,
    ) -> Result {
        let buflen = (in_header.len as usize)
            .checked_sub(size_of::())
            .and_then(|l| l.checked_sub(msg_size))
            .ok_or(Error::InvalidHeaderLength)?;
        let mut buf = vec![0; buflen];

        r.read_exact(&mut buf).map_err(Error::DecodeMessage)?;

        // We want to include the '\0' byte in the first slice.
        let split_pos = buf
            .iter()
            .position(|c| *c == b'\0')
            .map(|p| p + 1)
            .ok_or(Error::MissingParameter)?;

        let (oldname, newname) = buf.split_at(split_pos);

        match self.fs.rename(
            Context::from(in_header),
            in_header.nodeid.into(),
            bytes_to_cstr(oldname)?,
            newdir.into(),
            bytes_to_cstr(newname)?,
            flags,
        ) {
            Ok(()) => reply_ok(None::, None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_RENAME: plain rename (no flags).
    fn rename(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let RenameIn { newdir } = r.read_obj().map_err(Error::DecodeMessage)?;

        self.do_rename(in_header, size_of::(), newdir, 0, r, w)
    }

    /// FUSE_RENAME2: rename with flags, masked to the renameat2(2) set.
    fn rename2(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let Rename2In { newdir, flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?;

        let flags =
            flags & (libc::RENAME_EXCHANGE | libc::RENAME_NOREPLACE | libc::RENAME_WHITEOUT);

        self.do_rename(in_header, size_of::(), newdir, flags, r, w)
    }

    /// FUSE_LINK: create a hard link to `oldnodeid` under the parent inode.
    fn link(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let LinkIn { oldnodeid } = r.read_obj().map_err(Error::DecodeMessage)?;

        let namelen = (in_header.len as usize)
            .checked_sub(size_of::())
            .and_then(|l| l.checked_sub(size_of::()))
            .ok_or(Error::InvalidHeaderLength)?;
        let mut name = vec![0; namelen];

        r.read_exact(&mut name).map_err(Error::DecodeMessage)?;

        match self.fs.link(
            Context::from(in_header),
            oldnodeid.into(),
            in_header.nodeid.into(),
            bytes_to_cstr(&name)?,
        ) {
            Ok(entry) => {
                let out = EntryOut::from(entry);

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_OPEN: open the inode and reply with the handle and open flags.
    fn open(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let OpenIn {
            flags, open_flags, ..
        } = r.read_obj().map_err(Error::DecodeMessage)?;

        let kill_priv = open_flags & OPEN_KILL_SUIDGID != 0;

        match self.fs.open(
            Context::from(in_header),
            in_header.nodeid.into(),
            kill_priv,
            flags,
        ) {
            Ok((handle, opts)) => {
                let out = OpenOut {
                    fh: handle.map(Into::into).unwrap_or(0),
                    open_flags: opts.bits(),
                    ..Default::default()
                };

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_READ: read file data directly into the reply buffer.
    fn read(&self, in_header: InHeader, mut r: Reader, mut w: Writer) -> Result {
        let ReadIn {
            fh,
            offset,
            size,
            read_flags,
            lock_owner,
            flags,
            ..
        } = r.read_obj().map_err(Error::DecodeMessage)?;

        let owner = if read_flags & READ_LOCKOWNER != 0 {
            Some(lock_owner)
        } else {
            None
        };

        // Split the writer into 2 pieces: one for the `OutHeader` and the rest for the data.
        let data_writer = ZcWriter(w.split_at(size_of::()).unwrap());

        match self.fs.read(
            Context::from(in_header),
            in_header.nodeid.into(),
            fh.into(),
            data_writer,
            size,
            offset,
            owner,
            flags,
        ) {
            Ok(count) => {
                // Don't use `reply_ok` because we need to set a custom size length for the
                // header.
                let out = OutHeader {
                    len: (size_of::() + count) as u32,
                    error: 0,
                    unique: in_header.unique,
                };

                debug!("Replying OK, header: {:?}", out);
                w.write_all(out.as_slice()).map_err(Error::EncodeMessage)?;
                Ok(out.len as usize)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_WRITE: write file data streamed from the request buffer.
    fn write(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let WriteIn {
            fh,
            offset,
            size,
            write_flags,
            lock_owner,
            flags,
            ..
        } = r.read_obj().map_err(Error::DecodeMessage)?;

        let owner = if write_flags & WRITE_LOCKOWNER != 0 {
            Some(lock_owner)
        } else {
            None
        };

        let delayed_write = write_flags & WRITE_CACHE != 0;
        let kill_priv = write_flags & WRITE_KILL_PRIV != 0;

        let data_reader = ZcReader(r);

        match self.fs.write(
            Context::from(in_header),
            in_header.nodeid.into(),
            fh.into(),
            data_reader,
            size,
            offset,
            owner,
            delayed_write,
            kill_priv,
            flags,
        ) {
            Ok(count) => {
                let out = WriteOut {
                    size: count as u32,
                    ..Default::default()
                };

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_STATFS: return filesystem statistics.
    fn statfs(&self, in_header: InHeader, w: Writer) -> Result {
        match self
            .fs
            .statfs(Context::from(in_header), in_header.nodeid.into())
        {
            Ok(st) => reply_ok(Some(Kstatfs::from(st)), None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_RELEASE: close a file handle, optionally flushing / releasing
    /// flock state first.
    fn release(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let ReleaseIn {
            fh,
            flags,
            release_flags,
            lock_owner,
        } = r.read_obj().map_err(Error::DecodeMessage)?;

        let flush = release_flags & RELEASE_FLUSH != 0;
        let flock_release = release_flags & RELEASE_FLOCK_UNLOCK != 0;
        let lock_owner = if flush || flock_release {
            Some(lock_owner)
        } else {
            None
        };

        match self.fs.release(
            Context::from(in_header),
            in_header.nodeid.into(),
            flags,
            fh.into(),
            flush,
            flock_release,
            lock_owner,
        ) {
            Ok(()) => reply_ok(None::, None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_FSYNC: sync file contents (bit 0 of `fsync_flags` = datasync).
    fn fsync(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let FsyncIn {
            fh, fsync_flags, ..
        } = r.read_obj().map_err(Error::DecodeMessage)?;

        let datasync = fsync_flags & 0x1 != 0;

        match self.fs.fsync(
            Context::from(in_header),
            in_header.nodeid.into(),
            datasync,
            fh.into(),
        ) {
            Ok(()) => reply_ok(None::, None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_SETXATTR: set an extended attribute. Handles both the extended
    /// (`SETXATTR_EXT`) and the compat request layouts.
    fn setxattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed));
        let (
            SetxattrIn {
                size,
                flags,
                setxattr_flags,
                ..
            },
            setxattrin_size,
        ) = if options.contains(FsOptions::SETXATTR_EXT) {
            (
                r.read_obj().map_err(Error::DecodeMessage)?,
                size_of::(),
            )
        } else {
            let SetxattrInCompat { size, flags } = r.read_obj().map_err(Error::DecodeMessage)?;
            (
                SetxattrIn {
                    size,
                    flags,
                    setxattr_flags: 0,
                    padding: 0,
                },
                size_of::(),
            )
        };

        // The name and value are encoded one after another and separated by a '\0' character.
        let len = (in_header.len as usize)
            .checked_sub(size_of::())
            .and_then(|l| l.checked_sub(setxattrin_size))
            .ok_or(Error::InvalidHeaderLength)?;
        let mut buf = vec![0; len];

        r.read_exact(&mut buf).map_err(Error::DecodeMessage)?;

        // We want to include the '\0' byte in the first slice.
        let split_pos = buf
            .iter()
            .position(|c| *c == b'\0')
            .map(|p| p + 1)
            .ok_or(Error::MissingParameter)?;

        let (name, value) = buf.split_at(split_pos);

        // The declared value size must match what was actually transmitted.
        if size != value.len() as u32 {
            return Err(Error::InvalidXattrSize((size, value.len())));
        }

        match self.fs.setxattr(
            Context::from(in_header),
            in_header.nodeid.into(),
            bytes_to_cstr(name)?,
            value,
            flags,
            SetxattrFlags::from_bits_truncate(setxattr_flags),
        ) {
            Ok(()) => reply_ok(None::, None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_GETXATTR: return an attribute's value, or just its size when
    /// the request's `size` is 0.
    fn getxattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let GetxattrIn { size, .. } = r.read_obj().map_err(Error::DecodeMessage)?;

        let namelen = (in_header.len as usize)
            .checked_sub(size_of::())
            .and_then(|l| l.checked_sub(size_of::()))
            .ok_or(Error::InvalidHeaderLength)?;
        let mut name = vec![0; namelen];

        r.read_exact(&mut name).map_err(Error::DecodeMessage)?;

        if size > MAX_BUFFER_SIZE {
            return reply_error(
                io::Error::from_raw_os_error(libc::ENOMEM),
                in_header.unique,
                w,
            );
        }

        match self.fs.getxattr(
            Context::from(in_header),
            in_header.nodeid.into(),
            bytes_to_cstr(&name)?,
            size,
        ) {
            Ok(GetxattrReply::Value(val)) => reply_ok(None::, Some(&val), in_header.unique, w),
            Ok(GetxattrReply::Count(count)) => {
                let out = GetxattrOut {
                    size: count,
                    ..Default::default()
                };

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_LISTXATTR: return the attribute name list, or its size when the
    /// request's `size` is 0.
    fn listxattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let GetxattrIn { size, .. } = r.read_obj().map_err(Error::DecodeMessage)?;

        if size > MAX_BUFFER_SIZE {
            return reply_error(
                io::Error::from_raw_os_error(libc::ENOMEM),
                in_header.unique,
                w,
            );
        }

        match self
            .fs
            .listxattr(Context::from(in_header), in_header.nodeid.into(), size)
        {
            Ok(ListxattrReply::Names(val)) => reply_ok(None::, Some(&val), in_header.unique, w),
            Ok(ListxattrReply::Count(count)) => {
                let out = GetxattrOut {
                    size: count,
                    ..Default::default()
                };

                reply_ok(Some(out), None, in_header.unique, w)
            }
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_REMOVEXATTR: remove an extended attribute by name.
    fn removexattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let namelen = (in_header.len as usize)
            .checked_sub(size_of::())
            .ok_or(Error::InvalidHeaderLength)?;

        let mut buf = vec![0; namelen];

        r.read_exact(&mut buf).map_err(Error::DecodeMessage)?;

        let name = bytes_to_cstr(&buf)?;

        match self
            .fs
            .removexattr(Context::from(in_header), in_header.nodeid.into(), name)
        {
            Ok(()) => reply_ok(None::, None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_FLUSH: flush an open file on close(2) of a descriptor.
    fn flush(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let FlushIn { fh, lock_owner, .. } = r.read_obj().map_err(Error::DecodeMessage)?;

        match self.fs.flush(
            Context::from(in_header),
            in_header.nodeid.into(),
            fh.into(),
            lock_owner,
        ) {
            Ok(()) => reply_ok(None::, None, in_header.unique, w),
            Err(e) => reply_error(e, in_header.unique, w),
        }
    }

    /// FUSE_INIT: negotiate protocol version and feature flags with the
    /// guest. (Continues beyond this chunk.)
    fn init(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result {
        let InitInCompat {
            major,
            minor,
            max_readahead,
            flags,
        } = r.read_obj().map_err(Error::DecodeMessage)?;

        let options = FsOptions::from_bits_truncate(flags as u64);

        // The extended init struct is only present when INIT_EXT is offered.
        let InitInExt { flags2, .. } = if options.contains(FsOptions::INIT_EXT) {
            r.read_obj().map_err(Error::DecodeMessage)?
} else { InitInExt::default() }; if major < KERNEL_VERSION { error!("Unsupported fuse protocol version: {}.{}", major, minor); return reply_error( io::Error::from_raw_os_error(libc::EPROTO), in_header.unique, w, ); } if major > KERNEL_VERSION { // Wait for the kernel to reply back with a 7.X version. let out = InitOut { major: KERNEL_VERSION, minor: KERNEL_MINOR_VERSION, ..Default::default() }; return reply_ok(Some(out), None, in_header.unique, w); } if minor < MIN_KERNEL_MINOR_VERSION { error!( "Unsupported fuse protocol minor version: {}.{}", major, minor ); return reply_error( io::Error::from_raw_os_error(libc::EPROTO), in_header.unique, w, ); } // These fuse features are supported by this server by default. let supported = FsOptions::ASYNC_READ | FsOptions::PARALLEL_DIROPS | FsOptions::BIG_WRITES | FsOptions::AUTO_INVAL_DATA | FsOptions::ASYNC_DIO | FsOptions::HAS_IOCTL_DIR | FsOptions::ATOMIC_O_TRUNC | FsOptions::MAX_PAGES | FsOptions::SUBMOUNTS | FsOptions::INIT_EXT | FsOptions::CREATE_SUPP_GROUP | FsOptions::ALLOW_IDMAP; let flags_64 = ((flags2 as u64) << 32) | (flags as u64); let capable = FsOptions::from_bits_truncate(flags_64); let page_size: u32 = unsafe { libc::sysconf(libc::_SC_PAGESIZE).try_into().unwrap() }; let max_pages = ((MAX_BUFFER_SIZE - 1) / page_size) + 1; match self.fs.init(capable) { Ok(want) => { let enabled = (capable & (want | supported)).bits(); self.options.store(enabled, Ordering::Relaxed); let out = InitOut { major: KERNEL_VERSION, minor: KERNEL_MINOR_VERSION, max_readahead, flags: enabled as u32, max_background: u16::MAX, congestion_threshold: (u16::MAX / 4) * 3, max_write: MAX_BUFFER_SIZE, time_gran: 1, // nanoseconds max_pages: max_pages.try_into().unwrap(), map_alignment: 0, flags2: (enabled >> 32) as u32, ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn opendir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let OpenIn { flags, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; match self .fs .opendir(Context::from(in_header), in_header.nodeid.into(), flags) { Ok((handle, opts)) => { let out = OpenOut { fh: handle.map(Into::into).unwrap_or(0), open_flags: opts.bits(), ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn readdir(&self, in_header: InHeader, mut r: Reader, mut w: Writer) -> Result { let ReadIn { fh, offset, size, .. } = r.read_obj().map_err(Error::DecodeMessage)?; if size > MAX_BUFFER_SIZE { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } let available_bytes = w.available_bytes(); if available_bytes < size as usize { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } // Skip over enough bytes for the header. let unique = in_header.unique; let mut cursor = w.split_at(size_of::()).unwrap(); let result = match self.fs.readdir( Context::from(in_header), in_header.nodeid.into(), fh.into(), size, offset, ) { Ok(mut entries) => { let mut total_written = 0; let mut err = None; while let Some(dirent) = entries.next() { let remaining = (size as usize).saturating_sub(total_written); match add_dirent(&mut cursor, remaining, dirent, None) { // No more space left in the buffer. Ok(0) => break, Ok(bytes_written) => { total_written += bytes_written; } Err(e) => { err = Some(e); break; } } } if let Some(err) = err { Err(err) } else { Ok(total_written) } } Err(e) => Err(e), }; match result { Ok(total_written) => reply_readdir(total_written, unique, w), Err(e) => reply_error(e, unique, w), } } fn handle_dirent<'d>( &self, in_header: &InHeader, dir_entry: DirEntry<'d>, ) -> io::Result<(DirEntry<'d>, Entry)> { let parent = in_header.nodeid.into(); let name = dir_entry.name.to_bytes(); let entry = if name == CURRENT_DIR_CSTR || name == PARENT_DIR_CSTR { // We use 0 for the inode value to indicate a negative entry. 
Entry { inode: 0, generation: 0, // Don't do lookups on the current directory or the parent directory, i.e. leave // most fields 0. attr: Attr { ino: dir_entry.ino, mode: dir_entry.type_ << 12, ..Default::default() }, attr_timeout: Duration::from_secs(0), entry_timeout: Duration::from_secs(0), } } else { self.fs .lookup(Context::from(*in_header), parent, dir_entry.name)? }; Ok((dir_entry, entry)) } fn readdirplus(&self, in_header: InHeader, mut r: Reader, mut w: Writer) -> Result { let ReadIn { fh, offset, size, .. } = r.read_obj().map_err(Error::DecodeMessage)?; if size > MAX_BUFFER_SIZE { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } let available_bytes = w.available_bytes(); if available_bytes < size as usize { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } // Skip over enough bytes for the header. let unique = in_header.unique; let mut cursor = w.split_at(size_of::()).unwrap(); let result = match self.fs.readdir( Context::from(in_header), in_header.nodeid.into(), fh.into(), size, offset, ) { Ok(mut entries) => { let mut total_written = 0; let mut err = None; while let Some(dirent) = entries.next() { let mut entry_inode = None; let bytes_written = self.handle_dirent(&in_header, dirent).and_then(|(d, e)| { entry_inode = Some(e.inode); let remaining = (size as usize).saturating_sub(total_written); add_dirent(&mut cursor, remaining, d, Some(e)) }); match bytes_written { Ok(0) => { // No more space left in the buffer but we need to undo the lookup // that created the Entry or we will end up with mismatched lookup // counts. 
if let Some(inode) = entry_inode { self.fs.forget(Context::from(in_header), inode.into(), 1); } break; } Ok(bytes_written) => { total_written += bytes_written; } Err(e) => { if let Some(inode) = entry_inode { self.fs.forget(Context::from(in_header), inode.into(), 1); } if total_written == 0 { // We haven't filled any entries yet so we can just propagate // the error. err = Some(e); } // We already filled in some entries. Returning an error now will // cause lookup count mismatches for those entries so just return // whatever we already have. break; } } } if let Some(err) = err { Err(err) } else { Ok(total_written) } } Err(e) => Err(e), }; match result { Ok(total_written) => reply_readdir(total_written, unique, w), Err(e) => reply_error(e, unique, w), } } fn releasedir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let ReleaseIn { fh, flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.releasedir( Context::from(in_header), in_header.nodeid.into(), flags, fh.into(), ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn fsyncdir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let FsyncIn { fh, fsync_flags, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; let datasync = fsync_flags & 0x1 != 0; match self.fs.fsyncdir( Context::from(in_header), in_header.nodeid.into(), datasync, fh.into(), ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn getlk(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.getlk() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn setlk(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.setlk() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn setlkw(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.setlkw() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn access(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let AccessIn { mask, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self .fs .access(Context::from(in_header), in_header.nodeid.into(), mask) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn create(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let CreateIn { flags, mode, umask, open_flags, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; let remaining_len = (in_header.len as usize) .checked_sub(size_of::()) .and_then(|l| l.checked_sub(size_of::())) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0; remaining_len]; r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; let mut components = buf.split_inclusive(|c| *c == b'\0'); let name = components.next().ok_or(Error::MissingParameter)?; let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed)); let extensions = get_extensions(options, name.len(), buf.as_slice())?; let kill_priv = open_flags & OPEN_KILL_SUIDGID != 0; match self.fs.create( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(name)?, mode, kill_priv, flags, umask, extensions, ) { Ok((entry, handle, opts)) => { let entry_out = EntryOut::from(entry); let open_out = OpenOut { fh: handle.map(Into::into).unwrap_or(0), open_flags: opts.bits(), ..Default::default() }; // Kind of a hack to write both structs. reply_ok( Some(entry_out), Some(open_out.as_slice()), in_header.unique, w, ) } Err(e) => reply_error(e, in_header.unique, w), } } fn interrupt(&self, _in_header: InHeader) -> usize { 0 } fn bmap(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.bmap() { reply_error(e, in_header.unique, w) } else { Ok(0) } } /// Public so the `VhostUserBackend` implementation can call this on device reset pub fn destroy(&self) -> usize { // No reply to this function. 
self.fs.destroy(); self.options .store(FsOptions::empty().bits(), Ordering::Relaxed); 0 } fn ioctl(&self, in_header: InHeader, _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.ioctl() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn poll(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.poll() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn notify_reply(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.notify_reply() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn batch_forget(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let BatchForgetIn { count, .. } = r.read_obj().map_err(Error::DecodeMessage)?; if let Some(size) = (count as usize).checked_mul(size_of::()) { if size > MAX_BUFFER_SIZE as usize { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } } else { return reply_error( io::Error::from_raw_os_error(libc::EOVERFLOW), in_header.unique, w, ); } let mut requests = Vec::with_capacity(count as usize); for _ in 0..count { requests.push( r.read_obj::() .map(|f| (f.nodeid.into(), f.nlookup)) .map_err(Error::DecodeMessage)?, ); } self.fs.batch_forget(Context::from(in_header), requests); // No reply for forget messages. Ok(0) } fn fallocate(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let FallocateIn { fh, offset, length, mode, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.fallocate( Context::from(in_header), in_header.nodeid.into(), fh.into(), mode, offset, length, ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn lseek(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let LseekIn { fh, offset, whence, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.lseek( Context::from(in_header), in_header.nodeid.into(), fh.into(), offset, whence, ) { Ok(offset) => { let out = LseekOut { offset }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn copyfilerange(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let CopyfilerangeIn { fh_in, off_in, nodeid_out, fh_out, off_out, len, flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.copyfilerange( Context::from(in_header), in_header.nodeid.into(), fh_in.into(), off_in, nodeid_out.into(), fh_out.into(), off_out, len, flags, ) { Ok(count) => { let out = WriteOut { size: count as u32, ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn syncfs(&self, in_header: InHeader, w: Writer) -> Result { match self .fs .syncfs(Context::from(in_header), in_header.nodeid.into()) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn tmpfile(&self, in_header: InHeader, _r: Reader, w: Writer) -> Result { let e = self .fs .tmpfile() .err() .unwrap_or_else(|| panic!("unsupported operation")); reply_error(e, in_header.unique, w) } } impl SerializableFileSystem for Server { fn prepare_serialization(&self, cancel: Arc) { self.fs.prepare_serialization(cancel) } fn serialize(&self, state_pipe: File) -> io::Result<()> { self.fs.serialize(state_pipe) } fn deserialize_and_apply(&self, state_pipe: File) -> io::Result<()> { self.fs.deserialize_and_apply(state_pipe) } } fn reply_readdir(len: usize, unique: u64, mut w: Writer) -> Result { let out = OutHeader { len: (size_of::() + len) as u32, error: 0, unique, }; debug!("Replying OK, header: {:?}", out); w.write_all(out.as_slice()).map_err(Error::EncodeMessage)?; w.flush().map_err(Error::FlushMessage)?; Ok(out.len as usize) } fn reply_ok( out: Option, data: Option<&[u8]>, unique: 
u64, mut w: Writer, ) -> Result { let mut len = size_of::(); if out.is_some() { len += size_of::(); } if let Some(data) = data { len += data.len(); } let header = OutHeader { len: len as u32, error: 0, unique, }; debug!("Replying OK, header: {:?}", header); w.write_all(header.as_slice()) .map_err(Error::EncodeMessage)?; if let Some(out) = out { w.write_all(out.as_slice()).map_err(Error::EncodeMessage)?; } if let Some(data) = data { w.write_all(data).map_err(Error::EncodeMessage)?; } debug_assert_eq!(len, w.bytes_written()); Ok(w.bytes_written()) } fn strerror(error: i32) -> String { let mut err_desc: Vec = vec![0; 256]; let buf_ptr = err_desc.as_mut_ptr() as *mut libc::c_char; // Safe because libc::strerror_r writes in err_desc at most err_desc.len() bytes unsafe { // We ignore the returned value since the two possible error values are: // EINVAL and ERANGE, in the former err_desc will be "Unknown error #" // and in the latter the message will be truncated to fit err_desc libc::strerror_r(error, buf_ptr, err_desc.len()); } let err_desc = err_desc.split(|c| *c == b'\0').next().unwrap(); String::from_utf8(err_desc.to_vec()).unwrap_or_else(|_| "".to_owned()) } fn reply_error(e: io::Error, unique: u64, mut w: Writer) -> Result { let header = OutHeader { len: size_of::() as u32, error: -e.raw_os_error().unwrap_or(libc::EIO), unique, }; debug!( "Replying ERROR, header: OutHeader {{ error: {} ({}), unique: {}, len: {} }}", header.error, strerror(-header.error), header.unique, header.len ); w.write_all(header.as_slice()) .map_err(Error::EncodeMessage)?; debug_assert_eq!(header.len as usize, w.bytes_written()); Ok(w.bytes_written()) } fn bytes_to_cstr(buf: &[u8]) -> Result<&CStr> { // Convert to a `CStr` first so that we can drop the '\0' byte at the end // and make sure there are no interior '\0' bytes. 
CStr::from_bytes_with_nul(buf).map_err(Error::InvalidCString) } fn add_dirent( cursor: &mut Writer, max: usize, d: DirEntry, entry: Option, ) -> io::Result { // Strip the trailing '\0'. let name = d.name.to_bytes(); if name.len() > u32::MAX as usize { return Err(io::Error::from_raw_os_error(libc::EOVERFLOW)); } let dirent_len = size_of::() .checked_add(name.len()) .ok_or_else(|| io::Error::from_raw_os_error(libc::EOVERFLOW))?; // Directory entries must be padded to 8-byte alignment. If adding 7 causes // an overflow then this dirent cannot be properly padded. let padded_dirent_len = dirent_len .checked_add(7) .map(|l| l & !7) .ok_or_else(|| io::Error::from_raw_os_error(libc::EOVERFLOW))?; let total_len = if entry.is_some() { padded_dirent_len .checked_add(size_of::()) .ok_or_else(|| io::Error::from_raw_os_error(libc::EOVERFLOW))? } else { padded_dirent_len }; if max < total_len { Ok(0) } else { if let Some(entry) = entry { cursor.write_all(EntryOut::from(entry).as_slice())?; } let dirent = Dirent { ino: d.ino, off: d.offset, namelen: name.len() as u32, type_: d.type_, }; cursor.write_all(dirent.as_slice())?; cursor.write_all(name)?; // We know that `dirent_len` <= `padded_dirent_len` due to the check above // so there's no need for checked arithmetic. let padding = padded_dirent_len - dirent_len; if padding > 0 { cursor.write_all(&DIRENT_PADDING[..padding])?; } Ok(total_len) } } fn take_object(data: &[u8]) -> Result<(T, &[u8])> { if data.len() < size_of::() { return Err(Error::DecodeMessage(einval())); } let (object_bytes, remaining_bytes) = data.split_at(size_of::()); // SAFETY: `T` implements `ByteValued` that guarantees that it is safe to instantiate // `T` with random data. 
let object: T = unsafe { std::ptr::read_unaligned(object_bytes.as_ptr() as *const T) }; Ok((object, remaining_bytes)) } fn parse_security_context(nr_secctx: u32, data: &[u8]) -> Result> { // Although the FUSE security context extension allows sending several security contexts, // currently the guest kernel only sends one. if nr_secctx > 1 { return Err(Error::DecodeMessage(einval())); } else if nr_secctx == 0 { // No security context sent. May be no LSM supports it. return Ok(None); } let (secctx, data) = take_object::(data)?; if secctx.size == 0 { return Err(Error::DecodeMessage(einval())); } let mut components = data.split_inclusive(|c| *c == b'\0'); let secctx_name = components.next().ok_or(Error::MissingParameter)?; let (_, data) = data.split_at(secctx_name.len()); if data.len() < secctx.size as usize { return Err(Error::DecodeMessage(einval())); } // Fuse client aligns the whole security context block to 64 byte // boundary. So it is possible that after actual security context // of secctx.size, there are some null padding bytes left. If // we ever parse more data after secctx, we will have to take those // null bytes into account. Total size (including null bytes) is // available in SecctxHeader->size. let (remaining, _) = data.split_at(secctx.size as usize); let fuse_secctx = SecContext { name: CString::from_vec_with_nul(secctx_name.to_vec()).map_err(Error::InvalidCString2)?, secctx: remaining.to_vec(), }; Ok(Some(fuse_secctx)) } fn parse_sup_groups(data: &[u8]) -> Result { let (group_header, group_id_bytes) = take_object::(data)?; // The FUSE extension allows sending several group IDs, but currently the guest // kernel only sends one. 
if group_header.nr_groups != 1 { return Err(Error::DecodeMessage(einval())); } let (gid, _) = take_object::(group_id_bytes)?; Ok(gid.into()) } fn get_extensions(options: FsOptions, skip: usize, request_bytes: &[u8]) -> Result { let mut extensions = Extensions::default(); if !(options.contains(FsOptions::SECURITY_CTX) || options.contains(FsOptions::CREATE_SUPP_GROUP)) { return Ok(extensions); } // It's not guaranty to receive an extension even if it's supported by the guest kernel if request_bytes.len() < skip { return Err(Error::DecodeMessage(einval())); } // We need to track if a SecCtx was received, because it's valid // for the guest to send an empty SecCtx (i.e, nr_secctx == 0) let mut secctx_received = false; let mut buf = &request_bytes[skip..]; while !buf.is_empty() { let (extension_header, remaining_bytes) = take_object::(buf)?; let extension_size = (extension_header.size as usize) .checked_sub(size_of::()) .ok_or(Error::InvalidHeaderLength)?; let (current_extension_bytes, next_extension_bytes) = remaining_bytes.split_at(extension_size); let ext_type = ExtType::try_from(extension_header.ext_type) .map_err(|_| Error::DecodeMessage(einval()))?; match ext_type { ExtType::SecCtx(nr_secctx) => { if !options.contains(FsOptions::SECURITY_CTX) || secctx_received { return Err(Error::DecodeMessage(einval())); } secctx_received = true; extensions.secctx = parse_security_context(nr_secctx, current_extension_bytes)?; debug!("Extension received: {} SecCtx", nr_secctx); } ExtType::SupGroups => { if !options.contains(FsOptions::CREATE_SUPP_GROUP) || extensions.sup_gid.is_some() { return Err(Error::DecodeMessage(einval())); } extensions.sup_gid = parse_sup_groups(current_extension_bytes)?.into(); debug!("Extension received: SupGroups({:?})", extensions.sup_gid); } } // Let's process the next extension buf = next_extension_bytes; } // The SupGroup extension can be missing, since it is only sent if needed. // A SecCtx is always sent in create/synlink/mknod/mkdir if supported. 
if options.contains(FsOptions::SECURITY_CTX) && !secctx_received { return Err(Error::MissingExtension); } Ok(extensions) } virtiofsd-1.13.0/src/soft_idmap/cmdline.rs000064400000000000000000000205631046102023000166220ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. /*! * Provides structures to represent ID maps on the command line. * * The actual conversion of the [`Vec`](IdMap) we get from the command line to a * proper [`super::IdMap`] for runtime use is implemented in [`super`] * ([`super::IdMap as * TryFrom>`](`super::IdMap#impl-TryFrom>-for-IdMap`)). */ use std::fmt::{self, Display, Formatter}; use std::num::ParseIntError; use std::str::FromStr; /// Command-line configuration for UID/GID translation between host and guest. #[derive(Clone, Debug)] pub enum IdMap { /// 1:1 translate a guest ID range to a host ID range. Guest { /// First ID in the guest range. from_guest: u32, /// First ID in the host range. to_host: u32, /// Range length. count: u32, }, /// 1:1 translate a host ID range to a guest ID range. Host { /// First ID in the host range. from_host: u32, /// First ID in the guest range. to_guest: u32, /// Range length. count: u32, }, /// n:1 translate a guest ID range to a single host ID. SquashGuest { /// First ID in the guest range. from_guest: u32, /// Single target host ID. to_host: u32, /// Guest range length. count: u32, }, /// n:1 translate a host ID range to a single guest ID. SquashHost { /// First ID in the host range. from_host: u32, /// Single target guest ID. to_guest: u32, /// Host range length. count: u32, }, /// 1:1 translate between a guest ID range and a host ID range, both directions. Bidirectional { /// First ID in the guest range. guest: u32, /// First ID in the host range. host: u32, /// Range length. count: u32, }, /// Prohibit using the given range of guest IDs, returning an error when attempted. 
ForbidGuest { /// First ID in the guest range. from_guest: u32, /// Range length. count: u32, }, } /// Errors that can occur when parsing an `IdMap` argument. #[derive(Debug)] pub enum IdMapError { /// Invalid/unknown mapping type prefix. InvalidPrefix( /// The prefix in question. String, ), /// Invalid number of arguments. InvalidLength { /// Number of arguments expected. expected: usize, /// Number of arguments actually seen. seen: usize, }, /// Error parsing an integer. InvalidValue { /// The value in question that could not be parsed. value: String, /// The error we got. error: ParseIntError, }, } impl std::error::Error for IdMapError {} impl Display for IdMapError { fn fmt(&self, f: &mut Formatter) -> fmt::Result { match self { IdMapError::InvalidPrefix(prefix) => write!(f, "Invalid ID map prefix {prefix}"), IdMapError::InvalidLength { expected, seen } => write!( f, "Invalid ID map length (expected {expected} elements, got {seen} elements)" ), IdMapError::InvalidValue { value, error } => { write!(f, "Invalid value {value} in ID map: {error}") } } } } impl FromStr for IdMap { type Err = IdMapError; fn from_str(s: &str) -> Result { let (prefix, fields) = Self::pre_parse(s)?; match prefix.as_str() { "guest" => { Self::check_arg_count(&fields, 3)?; Ok(IdMap::Guest { from_guest: fields[0], to_host: fields[1], count: fields[2], }) } "host" => { Self::check_arg_count(&fields, 3)?; Ok(IdMap::Host { from_host: fields[0], to_guest: fields[1], count: fields[2], }) } "squash-guest" => { Self::check_arg_count(&fields, 3)?; Ok(IdMap::SquashGuest { from_guest: fields[0], to_host: fields[1], count: fields[2], }) } "squash-host" => { Self::check_arg_count(&fields, 3)?; Ok(IdMap::SquashHost { from_host: fields[0], to_guest: fields[1], count: fields[2], }) } "forbid-guest" => { Self::check_arg_count(&fields, 2)?; Ok(IdMap::ForbidGuest { from_guest: fields[0], count: fields[1], }) } "map" => { Self::check_arg_count(&fields, 3)?; Ok(IdMap::Bidirectional { guest: fields[0], host: 
fields[1], count: fields[2], }) } _ => Err(IdMapError::InvalidPrefix(prefix)), } } } impl Display for IdMap { fn fmt(&self, f: &mut Formatter) -> fmt::Result { match self { IdMap::Guest { from_guest, to_host, count, } => { write!(f, "guest:{from_guest}:{to_host}:{count}") } IdMap::Host { from_host, to_guest, count, } => { write!(f, "host:{from_host}:{to_guest}:{count}") } IdMap::SquashGuest { from_guest, to_host, count, } => { write!(f, "squash-guest:{from_guest}:{to_host}:{count}") } IdMap::SquashHost { from_host, to_guest, count, } => { write!(f, "squash-host:{from_host}:{to_guest}:{count}") } IdMap::ForbidGuest { from_guest, count } => { write!(f, "forbid-guest:{from_guest}:{count}") } IdMap::Bidirectional { guest, host, count } => { write!(f, "map:{guest}:{host}:{count}") } } } } impl IdMap { /** * Helper for [`Self::from_str()`]. * * Pre-parse an argument of the form `/^[a-zA-Z0-9_-]*(:[0-9]+){expected_len}$/` (separator * given as a colon here, but is allowed to be any non-alphanumeric separator except `-` and * `_`, though it must be the same for all fields). * * The prefix is returned as a string, the remaining numerical values as a parsed vector. */ fn pre_parse(s: &str) -> Result<(String, Vec), IdMapError> { let mut chars = s.chars(); let mut prefix = String::new(); let separator = loop { let Some(c) = chars.next() else { return Err(IdMapError::InvalidLength { // Not entirely right, but not entirely wrong either. 1 argument is always // expected. expected: 1, seen: 0, }); }; if c.is_alphanumeric() || c == '-' || c == '_' { for c in c.to_lowercase() { prefix.push(c); } } else { break c; } }; let values: Vec<&str> = chars.as_str().split(separator).collect(); let values = values .into_iter() .map(|v| { v.parse().map_err(|error| IdMapError::InvalidValue { value: String::from(v), error, }) }) .collect::, IdMapError>>()?; Ok((prefix, values)) } /// Verifies that `args`’s length is `expected_count`, returning an error otherwise. 
fn check_arg_count(args: &[u32], expected_count: usize) -> Result<(), IdMapError> { if args.len() != expected_count { Err(IdMapError::InvalidLength { expected: expected_count, seen: args.len(), }) } else { Ok(()) } } } virtiofsd-1.13.0/src/soft_idmap/id_types.rs000064400000000000000000000162761046102023000170350ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. /*! * Explicit types for host/guest UIDs/GIDs. * * The types provided by this module make it explicit whether some ID is valid on the host or in * the guest, and whether it is a UID or a GID. Using them ensures proper and complete translation * between host and guest IDs, which would be difficult to prove when using primitive integer * types. */ use btree_range_map::{Measure, PartialEnum, RangePartialOrd}; use std::fmt::{self, Debug, Display, Formatter}; use std::ops::{Add, Sub}; /** * Common trait for all kinds of UIDs and GIDs. * * Its dependencies are: * - `Clone + Copy`: Must consist internally only of a plain integer, so must be copiable. * - `Debug + Display`: We want to easily print the type without deconstructing it. * - `From`: Must be constructable from its inner type (the raw numerical value). * - `Eq + PartialEq + Ord + PartialOrd`: Must be comparable, as would be expected from UIDs/GIDs. * - `Sub`: Must be able to calculate the offset of one ID compare to another, i.e. the * length of an ID range. * - `Send + Sync`: Must be shareable between threads. * - `'static`: Required to construct error objects that can then be put into `io::Error`. */ pub trait Id: Clone + Copy + Debug + Display + Eq + From + Ord + PartialEq + PartialOrd + Send + Sub + Sync + 'static { /** * Inner raw numerical type. * * Should be a primitive integer. `Range` must be usable as the key for a * `btree_range_map::RangeMap`, hence the additional dependencies beyond `Clone + Copy`. 
*/ type Inner: Clone + Copy + Measure + PartialEnum + RangePartialOrd; /// Is this a root UID/GID? fn is_root(&self) -> bool; /// Get the raw numerical value. fn into_inner(self) -> Self::Inner; } /** * Trait designating a guest UID/GID. * * Must be able to add the length of an ID range of the corresponding host type, so we can map one * range to the other, e.g. like so: * ``` * # use std::ops::Range; * # use virtiofsd::soft_idmap::{GuestUid, HostUid, Id}; * # let guest_id_base: GuestUid = 5.into(); * # let host_id_range: Range = (8.into()..13.into()); * let guest_id_range = guest_id_base..(guest_id_base + (host_id_range.end - host_id_range.start)); * # assert!(guest_id_range.start == guest_id_base); * # assert!(guest_id_range.end.into_inner() == 5 + 13 - 8); * ``` * * Or: * ``` * # use std::ops::Range; * # use virtiofsd::soft_idmap::{GuestUid, HostUid, Id}; * # let guest_id_range: Range = (8.into()..13.into()); * # let host_id_range: Range = (21.into()..(21 + 13 - 8).into()); * # let host_id_in_range: HostUid = 23.into(); * let guest_id = guest_id_range.start + (host_id_in_range - host_id_range.start); * # assert!(guest_id.into_inner() == 8 + 23 - 21); * ``` * * (Hence the `Add<::Output, Output = Self>` dependency.) */ pub trait GuestId: Id + Add<::Output, Output = Self> { /// Respective host UID or GID. type HostType: HostId; /// Plain identity mapping to the numerically equal host UID/GID. fn id_mapped(self) -> Self::HostType; } /** * Trait designating a host UID/GID. * * Must be able to add the length of an ID range of the corresponding guest type, so we can map one * range to the other, e.g. 
like so: * ``` * # use std::ops::Range; * # use virtiofsd::soft_idmap::{GuestUid, HostUid, Id}; * # let host_id_base: HostUid = 13.into(); * # let guest_id_range: Range = (21.into()..34.into()); * let host_id_range = host_id_base..(host_id_base + (guest_id_range.end - guest_id_range.start)); * # assert!(host_id_range.start == host_id_base); * # assert!(host_id_range.end.into_inner() == 13 + 34 - 21); * ``` * * Or: * ``` * # use std::ops::Range; * # use virtiofsd::soft_idmap::{GuestUid, HostUid, Id}; * # let host_id_range: Range = (21.into()..34.into()); * # let guest_id_range: Range = (55.into()..(55 + 34 - 21).into()); * # let guest_id_in_range: GuestUid = 66.into(); * let host_id = host_id_range.start + (guest_id_in_range - guest_id_range.start); * # assert!(host_id.into_inner() == 21 + 66 - 55); * ``` * * (Hence the `Add<::Output, Output = Self>` dependency.) */ pub trait HostId: Id + Add<::Output, Output = Self> { /// Respective guest UID or GID. type GuestType: GuestId; /// Plain identity mapping to the numerically equal guest UID/GID. fn id_mapped(self) -> Self::GuestType; } /// Internal: Implement various traits for ID types. macro_rules! 
impl_ids { { $( $(#[$meta:meta])* $visibility:vis struct $t:ident< $opposite_name:tt = $opposite_type:ty, OffsetType = $offset_type:tt >($inner:ty): $variant_trait:tt; )* } => { $( $(#[$meta])* #[derive(Clone, Copy, Debug, Default, Eq, Ord, PartialEq, PartialOrd)] #[repr(transparent)] pub struct $t($inner); impl From<$inner> for $t { fn from(id: $inner) -> Self { $t(id) } } impl Id for $t { type Inner = $inner; fn is_root(&self) -> bool { self.0 == 0 } fn into_inner(self) -> $inner { self.0 } } impl $variant_trait for $t { type $opposite_name = $opposite_type; fn id_mapped(self) -> $opposite_type { self.into_inner().into() } } impl Add<$offset_type> for $t { type Output = $t; fn add(self, rhs: $offset_type) -> $t { (self.into_inner() + rhs.0).into() } } impl Sub<$t> for $t { type Output = $offset_type; fn sub(self, rhs: $t) -> $offset_type { $offset_type(self.into_inner() - rhs.into_inner()) } } impl Display for $t { fn fmt(&self, f: &mut Formatter) -> fmt::Result { let inner = (*self).into_inner(); write!(f, "{inner}") } } )* }; } /// Offset between two UIDs pub struct UidOffset(u32); /// Offset between two GIDs pub struct GidOffset(u32); impl_ids! { /// Guest UID type, i.e. a UID used in the guest. pub struct GuestUid(u32): GuestId; /// Guest GID type, i.e. a GID used in the guest. pub struct GuestGid(u32): GuestId; /// Host UID type, i.e. a UID valid on the host. pub struct HostUid(libc::uid_t): HostId; /// Host UID type, i.e. a GID valid on the host. pub struct HostGid(libc::gid_t): HostId; } virtiofsd-1.13.0/src/soft_idmap/mod.rs000064400000000000000000000252111046102023000157610ustar 00000000000000// Copyright 2024 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. /*! * Facilities for mapping UIDs/GIDs within virtiofsd. * * This module provides various facilities to map UIDs/GIDs between host and guest, with separate * translation functions in either direction. 
*/ pub mod cmdline; pub mod id_types; use crate::util::{other_io_error, ResultErrorContext}; use btree_range_map::RangeMap; pub use id_types::{GuestGid, GuestId, GuestUid, HostGid, HostId, HostUid, Id}; use std::convert::TryFrom; use std::fmt::{self, Display, Formatter}; use std::io; use std::ops::{Add, Range, Sub}; /** * Provides mappings for UIDs or GIDs between host and guest. * * Each `IdMap` will only translate UIDs or GIDs, not both. Translation in either direction (host * to guest, guest to host) is independent of the other direction, i.e. does not need to be * bijective (invertible). */ pub struct IdMap, Host: HostId> { /// Guest-to-host mapping. guest_to_host: RangeMap>, /// Host-to-guest mapping. host_to_guest: RangeMap>, } /** * Maps a range of IDs. * * Can be either UIDs or GIDs, and either host to guest or guest to host. */ #[derive(Clone, Debug, PartialEq)] enum MapEntry { /// Squash a range of IDs onto a single one. Squash { /// Range of source IDs. from: Range, /// Single target ID. to: Target, }, /// 1:1 map a range of IDs to another range (of the same length). Range { /// Range of source IDs. from: Range, /// First ID in the target range (i.e. mapping for `from.start`). to_base: Target, }, /// Disallow using this ID range: Return an error. Fail { /// Range of source IDs. from: Range, }, } #[derive(Clone, Debug)] pub enum MapError { ExplicitFailMapping { id: Source }, } impl IdMap where Guest: GuestId, Host: HostId, { /** * Create an empty map. * * Note that unmapped ranges default to identity mapping, i.e. an empty map will map everything * to itself (numerically speaking). */ pub fn empty() -> Self { IdMap { guest_to_host: RangeMap::new(), host_to_guest: RangeMap::new(), } } /// Map a guest UID/GID to one in the host domain. pub fn map_guest(&self, guest_id: Guest) -> Result> { self.guest_to_host .get(guest_id.into_inner()) .map(|e| e.map(guest_id)) .unwrap_or(Ok(guest_id.id_mapped())) } /// Map a host UID/GID to one in the guest domain. 
pub fn map_host(&self, host_id: Host) -> Result> { self.host_to_guest .get(host_id.into_inner()) .map(|e| e.map(host_id)) .unwrap_or(Ok(host_id.id_mapped())) } /** * Add a new mapping. * * Internal helper for [`Self::push_guest_to_host()`] and [`Self::push_host_to_guest()`]. * * `map` points to either `self.guest_to_host` or `self.host_to_guest`. `map_name` should be * `"Guest-to-host"` or `"Host-to-guest"` accordingly, and is only used to generate potential * error messages. */ fn do_push( map: &mut RangeMap>, map_name: &str, entry: MapEntry, ) -> io::Result<()> where Source: Id + Sub, Target: Id + Add<::Output, Output = Target>, { let wrapped_range = entry.source_range(); let inner_range = Range { start: wrapped_range.start.into_inner(), end: wrapped_range.end.into_inner(), }; if map.intersects(inner_range.clone()) { return Err(other_io_error(format!( "{map_name} mapping '{entry}' intersects previously added entry" ))); } map.insert(inner_range, entry); Ok(()) } /** * Add a new mapping of guest IDs to host ID(s). * * Internal helper for [`Self as * TryFrom>`](`Self#impl-TryFrom>-for-IdMap`). */ fn push_guest_to_host(&mut self, entry: MapEntry) -> io::Result<()> { Self::do_push(&mut self.guest_to_host, "Guest-to-host", entry) } /** * Add a new mapping of host IDs to guest ID(s). * * Internal helper for [`Self as * TryFrom>`](`Self#impl-TryFrom>-for-IdMap`). */ fn push_host_to_guest(&mut self, entry: MapEntry) -> io::Result<()> { Self::do_push(&mut self.host_to_guest, "Host-to-guest", entry) } } impl MapEntry where Source: Sub, Target: Add<::Output, Output = Target>, { /// Map an element from the source domain into the target domain. 
fn map(&self, id: Source) -> Result> { match self { MapEntry::Squash { from, to } => { assert!(from.contains(&id)); Ok(*to) } MapEntry::Range { from, to_base } => { assert!(from.contains(&id)); Ok(*to_base + (id - from.start)) } MapEntry::Fail { from } => { assert!(from.contains(&id)); Err(MapError::ExplicitFailMapping { id }) } } } /// Return the source ID range. fn source_range(&self) -> &Range { match self { MapEntry::Squash { from, to: _ } => from, MapEntry::Range { from, to_base: _ } => from, MapEntry::Fail { from } => from, } } } impl Display for MapError { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { MapError::ExplicitFailMapping { id } => { write!(f, "Use of ID {id} has been configured to fail") } } } } impl std::error::Error for MapError {} impl From> for io::Error { fn from(err: MapError) -> Self { io::Error::new(io::ErrorKind::PermissionDenied, err) } } impl Display for MapEntry where Source: Sub, Target: Add<::Output, Output = Target>, { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { MapEntry::Squash { from, to } => { write!(f, "squash [{}, {}) to {}", from.start, from.end, to) } MapEntry::Range { from, to_base } => { write!( f, "map [{}, {}) to [{}, {})", from.start, from.end, to_base, *to_base + (from.end - from.start) ) } MapEntry::Fail { from } => { write!(f, "fail [{}, {})", from.start, from.end) } } } } fn id_range_from_u32(base: u32, count: u32, param: P) -> io::Result> where u32: Into, { let start: I = base.into(); let end: I = base .checked_add(count) .ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidInput, format!("Parameter {param}: Range overflow"), ) })? .into(); Ok(start..end) } impl TryFrom> for IdMap where Guest: GuestId + From, Host: HostId + From, { type Error = io::Error; /// Convert from the command line representation to our runtime object. 
fn try_from(cmdline: Vec) -> io::Result { let mut map = IdMap::empty(); for entry in cmdline { match entry { cmdline::IdMap::Guest { from_guest, to_host, count, } => map .push_guest_to_host(MapEntry::Range { from: id_range_from_u32(from_guest, count, &entry)?, to_base: to_host.into(), }) .err_context(|| entry)?, cmdline::IdMap::Host { from_host, to_guest, count, } => map .push_host_to_guest(MapEntry::Range { from: id_range_from_u32(from_host, count, &entry)?, to_base: to_guest.into(), }) .err_context(|| entry)?, cmdline::IdMap::SquashGuest { from_guest, to_host, count, } => map .push_guest_to_host(MapEntry::Squash { from: id_range_from_u32(from_guest, count, &entry)?, to: to_host.into(), }) .err_context(|| entry)?, cmdline::IdMap::SquashHost { from_host, to_guest, count, } => map .push_host_to_guest(MapEntry::Squash { from: id_range_from_u32(from_host, count, &entry)?, to: to_guest.into(), }) .err_context(|| entry)?, cmdline::IdMap::Bidirectional { guest, host, count } => { map.push_guest_to_host(MapEntry::Range { from: id_range_from_u32(guest, count, &entry)?, to_base: host.into(), }) .err_context(|| &entry)?; map.push_host_to_guest(MapEntry::Range { from: id_range_from_u32(host, count, &entry)?, to_base: guest.into(), }) .err_context(|| &entry)?; } cmdline::IdMap::ForbidGuest { from_guest, count } => { map.push_guest_to_host(MapEntry::Fail { from: (from_guest.into())..((from_guest + count).into()), }) .err_context(|| &entry)?; } } } Ok(map) } } virtiofsd-1.13.0/src/util.rs000064400000000000000000000152301046102023000140320ustar 00000000000000// Copyright 2022 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
use std::fs::{File, OpenOptions}; use std::io::{Error, Write}; use std::os::unix::fs::{MetadataExt, OpenOptionsExt}; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::path::Path; use std::{fs, io, process}; fn try_lock_file(file: &File) -> Result<(), Error> { // Safe because 'file' must exist and we check the return value. let file_fd = file.as_raw_fd(); let ret = unsafe { libc::flock(file_fd, libc::LOCK_EX | libc::LOCK_NB) }; if ret == -1 { return Err(Error::last_os_error()); } Ok(()) } pub fn write_pid_file(pid_file_name: &Path) -> Result { let mut pid_file = loop { let file = OpenOptions::new() .mode(libc::S_IRUSR | libc::S_IWUSR) .custom_flags(libc::O_CLOEXEC) .write(true) .create(true) .open(pid_file_name)?; try_lock_file(&file)?; // Let's make sure the file we locked still exists in the filesystem. let locked = file.metadata()?.ino(); let current = match fs::metadata(pid_file_name) { Ok(stat) => stat.ino(), _ => continue, // the pid file got removed or some error happened, try again. }; if locked == current { break file; // lock successfully acquired. } // the file changed, other process is racing with us, so try again. }; let pid = format!("{}\n", process::id()); pid_file.write_all(pid.as_bytes())?; Ok(pid_file) } unsafe fn pidfd_open(pid: libc::pid_t, flags: libc::c_uint) -> libc::c_int { libc::syscall(libc::SYS_pidfd_open, pid, flags) as libc::c_int } /// Helper function to create a process and sets the parent process /// death signal SIGTERM pub fn sfork() -> io::Result { let cur_pid = unsafe { libc::getpid() }; // We use pidfd_open(2) to check the parent's pid because if the // child is created inside a pid namespace, getppid(2) will always // return 0 let parent_pidfd = unsafe { pidfd_open(cur_pid, 0) }; if parent_pidfd == -1 { return Err(Error::last_os_error()); } // We wrap the parent PID file descriptor in a File object to ensure that is // auto-closed when it goes out of scope. 
But, since nothing can be read, using read(2), // from a PID file descriptor returned by pidfd_open(2) (it fails with EINVAL), we // use a new type PidFd to prevent using the File's methods directly, and in the hope // that whoever wants to do so will read this first. // This is a temporary solution until OwnedFd is stabilized. #[allow(dead_code)] struct PidFd(File); let _pidfd = unsafe { PidFd(File::from_raw_fd(parent_pidfd)) }; let child_pid = unsafe { libc::fork() }; if child_pid == -1 { return Err(Error::last_os_error()); } if child_pid == 0 { // Request to receive SIGTERM on parent's death. let ret = unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) }; assert_eq!(ret, 0); // This shouldn't fail because libc::SIGTERM is a valid signal number // Check if the original parent died before libc::prctl() was called let mut pollfds = libc::pollfd { fd: parent_pidfd, events: libc::POLLIN, revents: 0, }; let num_fds = unsafe { libc::poll(&mut pollfds, 1, 0) }; if num_fds == -1 { return Err(io::Error::last_os_error()); } if num_fds != 0 { // The original parent died return Err(other_io_error("Parent process died unexpectedly")); } } Ok(child_pid) } pub fn wait_for_child(pid: i32) -> ! { // Drop all capabilities, since the parent doesn't require any // capabilities, as it'd be just waiting for the child to exit. capng::clear(capng::Set::BOTH); if let Err(e) = capng::apply(capng::Set::BOTH) { // Don't exit the process here since we already have a child. error!("warning: can't apply the parent capabilities: {}", e); } let mut status = 0; // On success, `libc::waitpid()` returns the PID of the child. 
if unsafe { libc::waitpid(pid, &mut status, 0) } != pid { error!("Error during waitpid()"); process::exit(1); } let exit_code = if libc::WIFEXITED(status) { libc::WEXITSTATUS(status) } else if libc::WIFSIGNALED(status) { let signal = libc::WTERMSIG(status); error!("Child process terminated by signal {}", signal); -signal } else { error!("Unexpected waitpid status: {:#X}", status); libc::EXIT_FAILURE }; process::exit(exit_code); } /// Add a capability to the effective set /// # Errors /// An error variant will be returned: /// - if the input string does not match the name, without the 'CAP_' prefix, of any of the /// capabilities defined in `linux/capabiliy.h`. /// - if `capng::get_caps_process()` cannot get the capabilities and bounding set of the process. /// - if `capng::update()` fails to update the internal posix capabilities settings. /// - if `capng::apply()` fails to transfer the specified internal posix capabilities settings to /// the kernel. pub fn add_cap_to_eff(cap_name: &str) -> capng::Result<()> { use capng::{Action, CUpdate, Set, Type}; let cap = capng::name_to_capability(cap_name)?; capng::get_caps_process()?; let req = vec![CUpdate { action: Action::ADD, cap_type: Type::EFFECTIVE, capability: cap, }]; capng::update(req)?; capng::apply(Set::CAPS)?; Ok(()) } /// Same as `io::Error::other()`, but the respective io_error_other feature has only been /// stabilized in Rust 1.74.0, which is too new for our intended targets. 
pub fn other_io_error>>(err: E) -> io::Error { io::Error::new(io::ErrorKind::Other, err) } /// Trait for `Error` object that allows prepending the error message by something that gives /// context pub trait ErrorContext { fn context(self, context: C) -> Self; } impl ErrorContext for io::Error { fn context(self, context: C) -> Self { io::Error::new(self.kind(), format!("{context}: {self}")) } } /// Lifts the `ErrorContext` trait to `Result` types pub trait ResultErrorContext { fn err_context C>(self, context: F) -> Self; } impl ResultErrorContext for Result { fn err_context C>(self, context: F) -> Self { self.map_err(|err| err.context(context())) } } virtiofsd-1.13.0/src/vhost_user.rs000064400000000000000000000627011046102023000152630ustar 00000000000000// Copyright 2019 Intel Corporation. All Rights Reserved. // // SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) use std::convert::TryInto; use std::fs::File; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex, RwLock}; use std::thread::{self, JoinHandle}; use std::{convert, error, fmt, io}; use futures::executor::{ThreadPool, ThreadPoolBuilder}; use libc::EFD_NONBLOCK; use log::*; use vhost::vhost_user::message::*; use vhost::vhost_user::Backend; use vhost_user_backend::bitmap::BitmapMmapRegion; use vhost_user_backend::{VhostUserBackend, VringMutex, VringState, VringT}; use virtio_bindings::bindings::virtio_config::*; use virtio_bindings::bindings::virtio_ring::{ VIRTIO_RING_F_EVENT_IDX, VIRTIO_RING_F_INDIRECT_DESC, }; use virtio_queue::{DescriptorChain, QueueOwnedT}; use vm_memory::{ ByteValued, GuestAddressSpace, GuestMemoryAtomic, GuestMemoryLoadGuard, GuestMemoryMmap, Le32, }; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; use crate::descriptor_utils::{Error as VufDescriptorError, Reader, Writer}; use crate::filesystem::{FileSystem, SerializableFileSystem}; use crate::server::Server; use crate::util::other_io_error; use crate::Error as VhostUserFsError; type 
LoggedMemory = GuestMemoryMmap; type LoggedMemoryAtomic = GuestMemoryAtomic; const QUEUE_SIZE: usize = 32768; // The spec allows for multiple request queues. We currently only support one. const REQUEST_QUEUES: u32 = 1; // In addition to the request queue there is one high-prio queue. // Since VIRTIO_FS_F_NOTIFICATION is not advertised we do not have a // notification queue. const NUM_QUEUES: usize = REQUEST_QUEUES as usize + 1; // The guest queued an available buffer for the high priority queue. const HIPRIO_QUEUE_EVENT: u16 = 0; // The guest queued an available buffer for the request queue. const REQ_QUEUE_EVENT: u16 = 1; /// The maximum length of the tag being used. pub const MAX_TAG_LEN: usize = 36; type Result = std::result::Result; // The compiler warns that some wrapped values are never read, but they are in fact read by // `::fmt()` via the derived `Debug`. #[allow(dead_code)] #[derive(Debug)] pub enum Error { /// Failed to create kill eventfd. CreateKillEventFd(io::Error), /// Failed to create thread pool. CreateThreadPool(io::Error), /// Failed to handle event other than input event. HandleEventNotEpollIn, /// Failed to handle unknown event. HandleEventUnknownEvent, /// Iterating through the queue failed. IterateQueue, /// No memory configured. NoMemoryConfigured, /// Processing queue failed. ProcessQueue(VhostUserFsError), /// Creating a queue reader failed. QueueReader(VufDescriptorError), /// Creating a queue writer failed. QueueWriter(VufDescriptorError), /// The unshare(CLONE_FS) call failed. UnshareCloneFs(io::Error), /// Invalid tag name InvalidTag, } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Error::UnshareCloneFs; match self { UnshareCloneFs(error) => { write!( f, "The unshare(CLONE_FS) syscall failed with '{error}'. \ If running in a container please check that the container \ runtime seccomp policy allows unshare." 
) } Self::InvalidTag => write!( f, "The tag may not be empty or longer than {MAX_TAG_LEN} bytes (encoded as UTF-8)." ), _ => write!(f, "{self:?}"), } } } impl error::Error for Error {} impl convert::From for io::Error { fn from(e: Error) -> Self { other_io_error(e) } } struct VhostUserFsThread { mem: Option, kill_evt: EventFd, server: Arc>, // handle request from backend to frontend vu_req: Option, event_idx: bool, pool: Option, } impl VhostUserFsThread { fn new(fs: F, thread_pool_size: usize) -> Result { let pool = if thread_pool_size > 0 { // Test that unshare(CLONE_FS) works, it will be called for each thread. // It's an unprivileged system call but some Docker/Moby versions are // known to reject it via seccomp when CAP_SYS_ADMIN is not given. // // Note that the program is single-threaded here so this syscall has no // visible effect and is safe to make. let ret = unsafe { libc::unshare(libc::CLONE_FS) }; if ret == -1 { return Err(Error::UnshareCloneFs(std::io::Error::last_os_error())); } Some( ThreadPoolBuilder::new() .after_start(|_| { // unshare FS for xattr operation let ret = unsafe { libc::unshare(libc::CLONE_FS) }; assert_eq!(ret, 0); // Should not fail }) .pool_size(thread_pool_size) .create() .map_err(Error::CreateThreadPool)?, ) } else { None }; Ok(VhostUserFsThread { mem: None, kill_evt: EventFd::new(EFD_NONBLOCK).map_err(Error::CreateKillEventFd)?, server: Arc::new(Server::new(fs)), vu_req: None, event_idx: false, pool, }) } fn return_descriptor( vring_state: &mut VringState, head_index: u16, event_idx: bool, len: usize, ) { let used_len: u32 = match len.try_into() { Ok(l) => l, Err(_) => panic!("Invalid used length, can't return used descritors to the ring"), }; if vring_state.add_used(head_index, used_len).is_err() { warn!("Couldn't return used descriptors to the ring"); } if event_idx { match vring_state.needs_notification() { Err(_) => { warn!("Couldn't check if queue needs to be notified"); vring_state.signal_used_queue().unwrap(); } 
Ok(needs_notification) => { if needs_notification { vring_state.signal_used_queue().unwrap(); } } } } else { vring_state.signal_used_queue().unwrap(); } } fn process_queue_pool(&self, vring: VringMutex) -> Result { let mut used_any = false; let atomic_mem = match &self.mem { Some(m) => m, None => return Err(Error::NoMemoryConfigured), }; while let Some(avail_desc) = vring .get_mut() .get_queue_mut() .iter(atomic_mem.memory()) .map_err(|_| Error::IterateQueue)? .next() { used_any = true; // Prepare a set of objects that can be moved to the worker thread. let atomic_mem = atomic_mem.clone(); let server = self.server.clone(); let mut vu_req = self.vu_req.clone(); let event_idx = self.event_idx; let worker_vring = vring.clone(); let worker_desc = avail_desc.clone(); self.pool.as_ref().unwrap().spawn_ok(async move { let mem = atomic_mem.memory(); let head_index = worker_desc.head_index(); let reader = Reader::new(&mem, worker_desc.clone()) .map_err(Error::QueueReader) .unwrap(); let writer = Writer::new(&mem, worker_desc.clone()) .map_err(Error::QueueWriter) .unwrap(); let len = server .handle_message(reader, writer, vu_req.as_mut()) .map_err(Error::ProcessQueue) .unwrap(); Self::return_descriptor(&mut worker_vring.get_mut(), head_index, event_idx, len); }); } Ok(used_any) } fn process_queue_serial( &self, vring_state: &mut VringState, ) -> Result { let mut used_any = false; let mem = match &self.mem { Some(m) => m.memory(), None => return Err(Error::NoMemoryConfigured), }; let mut vu_req = self.vu_req.clone(); let avail_chains: Vec>> = vring_state .get_queue_mut() .iter(mem.clone()) .map_err(|_| Error::IterateQueue)? 
.collect(); for chain in avail_chains { used_any = true; let head_index = chain.head_index(); let reader = Reader::new(&mem, chain.clone()) .map_err(Error::QueueReader) .unwrap(); let writer = Writer::new(&mem, chain.clone()) .map_err(Error::QueueWriter) .unwrap(); let len = self .server .handle_message(reader, writer, vu_req.as_mut()) .map_err(Error::ProcessQueue) .unwrap(); Self::return_descriptor(vring_state, head_index, self.event_idx, len); } Ok(used_any) } fn handle_event_pool( &self, device_event: u16, vrings: &[VringMutex], ) -> io::Result<()> { let idx = match device_event { HIPRIO_QUEUE_EVENT => { debug!("HIPRIO_QUEUE_EVENT"); 0 } REQ_QUEUE_EVENT => { debug!("QUEUE_EVENT"); 1 } _ => return Err(Error::HandleEventUnknownEvent.into()), }; if self.event_idx { // vm-virtio's Queue implementation only checks avail_index // once, so to properly support EVENT_IDX we need to keep // calling process_queue() until it stops finding new // requests on the queue. loop { vrings[idx].disable_notification().unwrap(); // we can't recover from an error here, so let's hope it's transient if let Err(e) = self.process_queue_pool(vrings[idx].clone()) { error!("processing the vring {idx}: {e}"); } if !vrings[idx].enable_notification().unwrap() { break; } } } else { // Without EVENT_IDX, a single call is enough. self.process_queue_pool(vrings[idx].clone())?; } Ok(()) } fn handle_event_serial( &self, device_event: u16, vrings: &[VringMutex], ) -> io::Result<()> { let mut vring_state = match device_event { HIPRIO_QUEUE_EVENT => { debug!("HIPRIO_QUEUE_EVENT"); vrings[0].get_mut() } REQ_QUEUE_EVENT => { debug!("QUEUE_EVENT"); vrings[1].get_mut() } _ => return Err(Error::HandleEventUnknownEvent.into()), }; if self.event_idx { // vm-virtio's Queue implementation only checks avail_index // once, so to properly support EVENT_IDX we need to keep // calling process_queue() until it stops finding new // requests on the queue. 
loop { vring_state.disable_notification().unwrap(); // we can't recover from an error here, so let's hope it's transient if let Err(e) = self.process_queue_serial(&mut vring_state) { error!("processing the vring: {e}"); } if !vring_state.enable_notification().unwrap() { break; } } } else { // Without EVENT_IDX, a single call is enough. self.process_queue_serial(&mut vring_state)?; } Ok(()) } } #[repr(C)] #[derive(Clone, Copy)] struct VirtioFsConfig { tag: [u8; MAX_TAG_LEN], num_request_queues: Le32, } // vm-memory needs a Default implementation even though these values are never // used anywhere... impl Default for VirtioFsConfig { fn default() -> Self { Self { tag: [0; MAX_TAG_LEN], num_request_queues: Le32::default(), } } } unsafe impl ByteValued for VirtioFsConfig {} struct PremigrationThread { handle: JoinHandle<()>, cancel: Arc, } /// A builder for configurable creation of [`VhostUserFsBackend`] objects. #[derive(Debug, Default)] pub struct VhostUserFsBackendBuilder { thread_pool_size: usize, tag: Option, } impl VhostUserFsBackendBuilder { /// Adjust the size of the thread pool to use. /// /// A value of `0` disables the usage of a thread pool. pub fn set_thread_pool_size(mut self, size: usize) -> Self { self.thread_pool_size = size; self } /// Set the tag to use for the file system. /// /// The tag length must not exceed [`MAX_TAG_LEN`] bytes. pub fn set_tag(mut self, tag: Option) -> Self { self.tag = tag; self } /// Build the [`VhostUserFsBackend`] object. 
pub fn build(self, fs: F) -> Result> where F: FileSystem + SerializableFileSystem + Send + Sync + 'static, { let thread = RwLock::new(VhostUserFsThread::new(fs, self.thread_pool_size)?); Ok(VhostUserFsBackend { thread, premigration_thread: None.into(), migration_thread: None.into(), tag: self.tag, }) } } pub struct VhostUserFsBackend { thread: RwLock>, premigration_thread: Mutex>, migration_thread: Mutex>>>, tag: Option, } impl VhostUserFsBackend { /// Create a [`VhostUserFsBackend`] without a thread pool or a tag. /// /// For more configurable creation refer to /// [`VhostUserFsBackendBuilder`]. pub fn new(fs: F) -> Result { VhostUserFsBackendBuilder::default().build(fs) } } impl VhostUserBackend for VhostUserFsBackend { type Bitmap = BitmapMmapRegion; type Vring = VringMutex; fn num_queues(&self) -> usize { NUM_QUEUES } fn max_queue_size(&self) -> usize { QUEUE_SIZE } fn features(&self) -> u64 { 1 << VIRTIO_F_VERSION_1 | 1 << VIRTIO_RING_F_INDIRECT_DESC | 1 << VIRTIO_RING_F_EVENT_IDX | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() | VhostUserVirtioFeatures::LOG_ALL.bits() } fn protocol_features(&self) -> VhostUserProtocolFeatures { let mut protocol_features = VhostUserProtocolFeatures::MQ | VhostUserProtocolFeatures::BACKEND_REQ | VhostUserProtocolFeatures::BACKEND_SEND_FD | VhostUserProtocolFeatures::REPLY_ACK | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS | VhostUserProtocolFeatures::LOG_SHMFD | VhostUserProtocolFeatures::DEVICE_STATE | VhostUserProtocolFeatures::RESET_DEVICE; if self.tag.is_some() { protocol_features |= VhostUserProtocolFeatures::CONFIG; } protocol_features } fn get_config(&self, offset: u32, size: u32) -> Vec { // virtio spec 1.2, 5.11.4: // The tag is encoded in UTF-8 and padded with NUL bytes if shorter than // the available space. This field is not NUL-terminated if the encoded // bytes take up the entire field. // The length was already checked when parsing the arguments. 
Hence, we // only assert that everything looks sane and pad with NUL bytes to the // fixed length. let tag = self.tag.as_ref().expect("Did not expect read of config if tag is not set. We do not advertise F_CONFIG in that case!"); assert!(tag.len() <= MAX_TAG_LEN, "too long tag length"); assert!(!tag.is_empty(), "tag should not be empty"); let mut fixed_len_tag = [0; MAX_TAG_LEN]; fixed_len_tag[0..tag.len()].copy_from_slice(tag.as_bytes()); let config = VirtioFsConfig { tag: fixed_len_tag, num_request_queues: Le32::from(REQUEST_QUEUES), }; let offset = offset as usize; let size = size as usize; let mut result: Vec<_> = config .as_slice() .iter() .skip(offset) .take(size) .copied() .collect(); // pad with 0s up to `size` result.resize(size, 0); result } fn acked_features(&self, features: u64) { if features & VhostUserVirtioFeatures::LOG_ALL.bits() != 0 { // F_LOG_ALL set: Prepare for migration (unless we're already doing that) let mut premigration_thread = self.premigration_thread.lock().unwrap(); if premigration_thread.is_none() { let cancel = Arc::new(AtomicBool::new(false)); let cloned_server = Arc::clone(&self.thread.read().unwrap().server); let cloned_cancel = Arc::clone(&cancel); let handle = thread::spawn(move || cloned_server.prepare_serialization(cloned_cancel)); *premigration_thread = Some(PremigrationThread { handle, cancel }); } } else { // F_LOG_ALL cleared: Migration cancelled, if any was ongoing // (Note that this is our interpretation, and not said by the specification. The back // end might clear this flag also on the source side once the VM has been stopped, even // before we receive SET_DEVICE_STATE_FD. QEMU will clear F_LOG_ALL only when the VM // is running, i.e. when the source resumes after a cancelled migration, which is // exactly what we want, but it would be better if we had a more reliable way that is // backed up by the spec. 
We could delay cancelling until we receive a guest request // while F_LOG_ALL is cleared, but that can take an indefinite amount of time.) if let Some(premigration_thread) = self.premigration_thread.lock().unwrap().take() { premigration_thread.cancel.store(true, Ordering::Relaxed); // Ignore the result, we are cancelling anyway let _ = premigration_thread.handle.join(); } } } fn reset_device(&self) { // Clear our device state self.thread.write().unwrap().server.destroy(); } fn set_event_idx(&self, enabled: bool) { self.thread.write().unwrap().event_idx = enabled; } fn update_memory(&self, mem: LoggedMemoryAtomic) -> io::Result<()> { self.thread.write().unwrap().mem = Some(mem); Ok(()) } fn handle_event( &self, device_event: u16, evset: EventSet, vrings: &[VringMutex], _thread_id: usize, ) -> io::Result<()> { if evset != EventSet::IN { return Err(Error::HandleEventNotEpollIn.into()); } let thread = self.thread.read().unwrap(); if thread.pool.is_some() { thread.handle_event_pool(device_event, vrings) } else { thread.handle_event_serial(device_event, vrings) } } fn exit_event(&self, _thread_index: usize) -> Option { Some(self.thread.read().unwrap().kill_evt.try_clone().unwrap()) } fn set_backend_req_fd(&self, vu_req: Backend) { self.thread.write().unwrap().vu_req = Some(vu_req); } fn set_device_state_fd( &self, direction: VhostTransferStateDirection, phase: VhostTransferStatePhase, file: File, ) -> io::Result> { // Our caller (vhost-user-backend crate) pretty much ignores error objects we return (only // cares whether we succeed or not), so log errors here if let Err(err) = self.do_set_device_state_fd(direction, phase, file) { error!("Failed to initiate state (de-)serialization: {err}"); return Err(err); } Ok(None) } fn check_device_state(&self) -> io::Result<()> { // Our caller (vhost-user-backend crate) pretty much ignores error objects we return (only // cares whether we succeed or not), so log errors here if let Err(err) = self.do_check_device_state() { 
error!("Migration failed: {err}"); return Err(err); } Ok(()) } } impl VhostUserFsBackend { fn do_set_device_state_fd( &self, direction: VhostTransferStateDirection, phase: VhostTransferStatePhase, file: File, ) -> io::Result<()> { if phase != VhostTransferStatePhase::STOPPED { return Err(io::Error::new( io::ErrorKind::Unsupported, format!("Transfer in phase {:?} is not supported", phase), )); } let server = Arc::clone(&self.thread.read().unwrap().server); let join_handle = match direction { VhostTransferStateDirection::SAVE => { // We should have a premigration thread that was started with `F_LOG_ALL`. It // should already be finished, but you never know. let premigration_thread = self.premigration_thread.lock().unwrap().take(); thread::spawn(move || { if let Some(premigration_thread) = premigration_thread { // Let’s hope it’s finished. Otherwise, we block migration downtime for a // bit longer, but there’s nothing we can do. premigration_thread.handle.join().map_err(|_| { other_io_error( "Failed to finalize serialization preparation".to_string(), ) })?; } else { // If we don’t have a premigration thread, that either means migration was // cancelled at some point (i.e. F_LOG_ALL cleared; very unlikely and we // consider sending SET_DEVICE_STATE_FD afterwards a protocol violation), // or that there simply was no F_LOG_ALL at all. QEMU doesn’t necessarily // do memory logging when snapshotting, and in such cases we have no choice // but to just run preserialization now. warn!( "Front-end did not announce migration to begin, so we failed to \ prepare for it; collecting data now. If you are doing a snapshot, \ that is OK; otherwise, migration downtime may be prolonged." 
); server.prepare_serialization(Arc::new(AtomicBool::new(false))); } server.serialize(file).map_err(|e| { io::Error::new(e.kind(), format!("Failed to save state: {}", e)) }) }) } VhostTransferStateDirection::LOAD => { if let Some(premigration_thread) = self.premigration_thread.lock().unwrap().take() { // Strange, but OK premigration_thread.cancel.store(true, Ordering::Relaxed); warn!("Cancelling serialization preparation because of incoming migration"); let _ = premigration_thread.handle.join(); } thread::spawn(move || { server.deserialize_and_apply(file).map_err(|e| { io::Error::new(e.kind(), format!("Failed to load state: {}", e)) }) }) } }; *self.migration_thread.lock().unwrap() = Some(join_handle); Ok(()) } fn do_check_device_state(&self) -> io::Result<()> { let Some(migration_thread) = self.migration_thread.lock().unwrap().take() else { // `check_device_state()` must follow a successful `set_device_state_fd()`, so this is // a protocol violation return Err(io::Error::new( io::ErrorKind::InvalidInput, "Front-end attempts to check migration state, but no migration has been done", )); }; migration_thread .join() .map_err(|_| other_io_error("Failed to join the migration thread"))? } } impl Drop for VhostUserFsBackend { fn drop(&mut self) { let result = self .thread .read() .unwrap_or_else(|err| err.into_inner()) .kill_evt .write(1); if let Err(e) = result { error!("Error shutting down worker thread: {:?}", e) } } }