virtiofsd-1.10.0/.cargo_vcs_info.json0000644000000001360000000000100131240ustar { "git": { "sha1": "3c1f1c626319cada80408a63b043c5c6d9235f83" }, "path_in_vcs": "" }virtiofsd-1.10.0/.gitignore000064400000000000000000000000231046102023000136770ustar 00000000000000/target **/*.rs.bk virtiofsd-1.10.0/50-virtiofsd.json000064400000000000000000000001451046102023000150420ustar 00000000000000{ "description": "virtiofsd vhost-user-fs", "type": "fs", "binary": "/usr/libexec/virtiofsd" } virtiofsd-1.10.0/CONTRIBUTING.md000064400000000000000000000052721046102023000141530ustar 00000000000000# Contributing to virtiofsd virtiofsd is an open source project licensed under the [Apache v2 License](https://opensource.org/licenses/Apache-2.0) and the [BSD 3 Clause](https://opensource.org/licenses/BSD-3-Clause) license. ## Coding Style We follow the [Rust Style](https://github.com/rust-dev-tools/fmt-rfcs/blob/master/guide/guide.md) convention and enforce it through the Continuous Integration (CI) process calling into `rustfmt` for each submitted Pull Request (PR). ## Certificate of Origin In order to get a clear contribution chain of trust we use the [signed-off-by language](https://01.org/community/signed-process) used by the Linux kernel project. ## Patch format Beside the signed-off-by footer, we expect each patch to comply with the following format: ``` Change summary More detailed explanation of your changes: Why and how. Wrap it to 72 characters. See http://chris.beams.io/posts/git-commit/ for some more good pieces of advice. Signed-off-by: ``` For example: ``` Implement support for optional sandboxing Implement support for setting up a sandbox for running the service. The technique for this has been borrowed from virtiofsd, and consists on switching to new PID, mount and network namespaces, and then switching root to the directory to be shared. Future patches will implement additional hardening features like dropping capabilities and seccomp filters. 
Signed-off-by: Sergio Lopez ``` ## Pull requests virtiofsd uses the “fork-and-merge” development model. Follow these steps if you want to merge your changes to `virtiofsd`: 1. Fork the [virtiofsd](https://gitlab.com/virtio-fs/virtiofsd) project into your GitLab organization. 2. Within your fork, create a branch for your contribution. 3. [Create a merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) against the master branch of the virtiofsd repository. 4. Once the merge request is approved, one of the maintainers will merge it. ## Issue tracking If you have a problem, please let us know. We recommend using [gitlab issues](https://gitlab.com/virtio-fs/virtiofsd/-/issues/new) for formally reporting and documenting them. You can also contact us via email through the [virtio-fs mailing list](https://www.redhat.com/mailman/listinfo/virtio-fs). ## Closing issues You can either close issues manually by adding the fixing commit SHA1 to the issue comments or by adding the `Fixes` keyword to your commit message. After the corresponding MR is merged, GitLab will automatically close that issue when parsing the [commit message](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically). virtiofsd-1.10.0/Cargo.lock0000644000000472460000000000100111140ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
version = 3 [[package]] name = "aho-corasick" version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] [[package]] name = "anstream" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", "is-terminal", "utf8parse", ] [[package]] name = "anstyle" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" [[package]] name = "anstyle-parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" dependencies = [ "anstyle", "windows-sys", ] [[package]] name = "arc-swap" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5d78ce20460b82d3fa150275ed9d55e21064fc7951177baacf86a145c4a4b1f" [[package]] name = "atty" version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ "hermit-abi 0.1.19", "libc", "winapi", ] [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "capng" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6f8e9448233603643e42606121d95f5f8d4e015b3e7619a51593864dd902575" dependencies = [ "bitflags 1.3.2", "libc", ] [[package]] name = "cc" version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" version = "4.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" dependencies = [ "clap_builder", "clap_derive", "once_cell", ] [[package]] name = "clap_builder" version = "4.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" dependencies = [ "anstream", "anstyle", "clap_lex", "strsim", ] [[package]] name = "clap_derive" version = "4.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" dependencies = [ "heck", "proc-macro2", "quote", "syn 2.0.23", ] [[package]] name = "clap_lex" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" [[package]] name = "colorchoice" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] name = "env_logger" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" dependencies = [ "atty", "humantime", "log", "regex", "termcolor", ] [[package]] name = "errno" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", "windows-sys", ] [[package]] name = "errno-dragonfly" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" dependencies = [ "cc", "libc", ] [[package]] name = "error-chain" version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" dependencies = [ "version_check", ] [[package]] name = "futures" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" dependencies = [ "futures-channel", "futures-core", "futures-executor", "futures-io", "futures-sink", "futures-task", "futures-util", ] [[package]] name = "futures-channel" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" dependencies = [ "futures-core", "futures-sink", ] [[package]] name = "futures-core" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" [[package]] name = "futures-executor" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" dependencies = [ "futures-core", "futures-task", "futures-util", "num_cpus", ] [[package]] name = "futures-io" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" [[package]] name = "futures-macro" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" dependencies = [ "proc-macro2", "quote", "syn 1.0.98", ] [[package]] name = "futures-sink" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" [[package]] name = "futures-task" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" [[package]] name = "futures-util" version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" dependencies = [ "futures-channel", "futures-core", "futures-io", "futures-macro", "futures-sink", "futures-task", "memchr", "pin-project-lite", "pin-utils", "slab", ] [[package]] name = "heck" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "hermit-abi" version = 
"0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "hostname" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" dependencies = [ "libc", "match_cfg", "winapi", ] [[package]] name = "humantime" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "is-terminal" version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", "rustix", "windows-sys", ] [[package]] name = "itoa" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" [[package]] name = "libc" version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] name = "libseccomp-sys" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a7cbbd4ad467251987c6e5b47d53b11a5a05add08f2447a9e2d70aef1e0d138" [[package]] name = "linux-raw-sys" version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" [[package]] name = "log" version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", ] [[package]] name = "match_cfg" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" [[package]] name = "memchr" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "num_cpus" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi 0.1.19", "libc", ] [[package]] name = "num_threads" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" dependencies = [ "libc", ] [[package]] name = "once_cell" version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "pin-project-lite" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "proc-macro2" version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" dependencies = [ "proc-macro2", ] [[package]] name = "regex" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" dependencies = [ "aho-corasick", "memchr", 
"regex-syntax", ] [[package]] name = "regex-syntax" version = "0.6.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" [[package]] name = "rustix" version = "0.38.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399" dependencies = [ "bitflags 2.4.1", "errno", "libc", "linux-raw-sys", "windows-sys", ] [[package]] name = "slab" version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" dependencies = [ "autocfg", ] [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syn" version = "2.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syslog" version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "978044cc68150ad5e40083c9f6a725e6fd02d7ba1bcf691ec2ff0d66c0b41acc" dependencies = [ "error-chain", "hostname", "libc", "log", "time", ] [[package]] name = "termcolor" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" dependencies = [ "winapi-util", ] [[package]] name = "thiserror" version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "c16a64ba9387ef3fdae4f9c1a7f07a0997fce91985c0336f1ddc1822b3b37802" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d14928354b01c4d6a4f0e549069adef399a284e7995c7ccca94e8a07a5346c59" dependencies = [ "proc-macro2", "quote", "syn 2.0.23", ] [[package]] name = "time" version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217" dependencies = [ "itoa", "libc", "num_threads", ] [[package]] name = "unicode-ident" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" [[package]] name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "vhost" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b64e816d0d49769fbfaa1494eb77cc2a3ddc526ead05c7f922cb7d64106286f" dependencies = [ "bitflags 2.4.1", "libc", "vm-memory", "vmm-sys-util", ] [[package]] name = "vhost-user-backend" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72c8c447d076ac508d78cb45664d203df7989e891656dce260a7e93d72352c9a" dependencies = [ "libc", "log", "vhost", "virtio-bindings", "virtio-queue", "vm-memory", "vmm-sys-util", ] [[package]] name = "virtio-bindings" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "878bcb1b2812a10c30d53b0ed054999de3d98f25ece91fc173973f9c57aaae86" [[package]] name = "virtio-queue" version = 
"0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3f69a13d6610db9312acbb438b0390362af905d37634a2106be70c0f734986d" dependencies = [ "log", "virtio-bindings", "vm-memory", "vmm-sys-util", ] [[package]] name = "virtiofsd" version = "1.10.0" dependencies = [ "bitflags 1.3.2", "capng", "clap", "env_logger", "futures", "libc", "libseccomp-sys", "log", "syslog", "vhost", "vhost-user-backend", "virtio-bindings", "virtio-queue", "vm-memory", "vmm-sys-util", ] [[package]] name = "vm-memory" version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74ffc42216c32c35f858fa4bfdcd9b61017dfd691e0240268fdc85dbf59e5459" dependencies = [ "arc-swap", "bitflags 2.4.1", "libc", "thiserror", "vmm-sys-util", "winapi", ] [[package]] name = "vmm-sys-util" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d1435039746e20da4f8d507a72ee1b916f7b4b05af7a91c093d2c6561934ede" dependencies = [ "bitflags 1.3.2", "libc", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.48.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.48.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" 
virtiofsd-1.10.0/Cargo.toml0000644000000032720000000000100111260ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "virtiofsd" version = "1.10.0" authors = ["The Virtiofs Project Developers"] exclude = [".gitlab-ci.yml"] description = "A virtio-fs vhost-user device daemon" homepage = "https://virtio-fs.gitlab.io/" readme = "README.md" license = "Apache-2.0 AND BSD-3-Clause" repository = "https://gitlab.com/virtio-fs/virtiofsd" [profile.release] lto = true [dependencies.bitflags] version = "1.2" [dependencies.capng] version = "0.2.2" [dependencies.clap] version = "4" features = ["derive"] [dependencies.env_logger] version = "0.8.4" [dependencies.futures] version = "0.3" features = ["thread-pool"] [dependencies.libc] version = "0.2.139" [dependencies.libseccomp-sys] version = "0.2" [dependencies.log] version = "0.4" [dependencies.syslog] version = "6.0" [dependencies.vhost] version = "0.10.0" [dependencies.vhost-user-backend] version = "0.13.1" [dependencies.virtio-bindings] version = "0.2.1" features = ["virtio-v5_0_0"] [dependencies.virtio-queue] version = "0.11.0" [dependencies.vm-memory] version = "0.14.0" features = [ "backend-mmap", "backend-atomic", ] [dependencies.vmm-sys-util] version = "0.12.1" [features] xen = [ "vhost-user-backend/xen", "vhost/xen", "vm-memory/xen", ] virtiofsd-1.10.0/Cargo.toml.orig000064400000000000000000000017521046102023000146100ustar 00000000000000[package] name = "virtiofsd" description = "A virtio-fs vhost-user device daemon" version = "1.10.0" authors = ["The Virtiofs Project 
Developers"] edition = "2018" homepage = "https://virtio-fs.gitlab.io/" repository = "https://gitlab.com/virtio-fs/virtiofsd" license = "Apache-2.0 AND BSD-3-Clause" readme = "README.md" exclude = [".gitlab-ci.yml"] [features] # Enabling Xen support will _disable_ QEMU/KVM support! xen = ["vhost-user-backend/xen", "vhost/xen", "vm-memory/xen"] [dependencies] bitflags = "1.2" capng = "0.2.2" env_logger = "0.8.4" futures = { version = "0.3", features = ["thread-pool"] } libc = "0.2.139" log = "0.4" libseccomp-sys = "0.2" clap = { version = "4", features = ["derive"] } vhost-user-backend = "0.13.1" vhost = "0.10.0" virtio-bindings = { version = "0.2.1", features = ["virtio-v5_0_0"] } vm-memory = { version = "0.14.0", features = ["backend-mmap", "backend-atomic"] } virtio-queue = "0.11.0" vmm-sys-util = "0.12.1" syslog = "6.0" [profile.release] lto = true virtiofsd-1.10.0/LICENSE-APACHE000064400000000000000000000261361046102023000136500ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. virtiofsd-1.10.0/LICENSE-BSD-3-Clause000064400000000000000000000030321046102023000146770ustar 00000000000000// Copyright 2017 The Chromium OS Authors. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. virtiofsd-1.10.0/README.md000064400000000000000000000404251046102023000132000ustar 00000000000000# virtiofsd A [virtio-fs](https://virtio-fs.gitlab.io/) vhost-user device daemon written in Rust. ## Building from sources ### Requirements This project depends on [libcap-ng](https://people.redhat.com/sgrubb/libcap-ng/) and [libseccomp](https://github.com/seccomp/libseccomp). You can obtain those dependencies by building them for their respective sources, or by installing the correspondent development packages from your distribution, if available: - Fedora/CentOS/RHEL ```shell dnf install libcap-ng-devel libseccomp-devel ``` - Debian/Ubuntu ```shell apt install libcap-ng-dev libseccomp-dev ``` ### Compiling virtiofsd is written in Rust, so you will have to install [Rust](https://www.rust-lang.org/learn/get-started) in order to compile it, and it uses [cargo](https://doc.rust-lang.org/cargo/) to manage the project and its dependencies. After installing Rust, you can compile it to a binary by running: ```shell cargo build --release ``` ## CI-built binaries Every time new code is merged, the CI pipeline will upload a debug binary of virtiofsd. It is intended to be an accessible way for anyone to download and test virtiofsd without needing a Rust toolchain installed. The debug binary is built only for x86\_64 Linux-based systems. 
[Click here to download the latest build]( https://gitlab.com/virtio-fs/virtiofsd/-/jobs/artifacts/main/download?job=publish) ## Contributing See [CONTRIBUTING.md](CONTRIBUTING.md) ## Usage This program must be run as the root user or as a "fake" root inside a user namespace (see [Running as non-privileged user](#running-as-non-privileged-user)). The program drops privileges where possible during startup, although it must be able to create and access files with any uid/gid: * The ability to invoke syscalls is limited using `seccomp(2)`. * Linux `capabilities(7)` are dropped. virtiofsd only retains the following capabilities: `CAP_CHOWN`, `CAP_DAC_OVERRIDE`, `CAP_FOWNER`, `CAP_FSETID`, `CAP_SETGID`, `CAP_SETUID`, `CAP_MKNOD`, `CAP_SETFCAP` (and `CAP_DAC_READ_SEARCH` if `--inode-file-handles` is used). ```shell virtiofsd [FLAGS] [OPTIONS] --fd |--socket-path --shared-dir ``` #### Flags ```shell -h, --help ``` Prints help information. ```shell -V, --version ``` Prints version information. ```shell --syslog ``` Log to syslog. Default: stderr. ```shell --print-capabilities ``` Print vhost-user.json backend program capabilities and exit. ```shell --allow-direct-io ``` Honor the `O_DIRECT` flag passed down by guest applications. ```shell --announce-submounts ``` Tell the guest which directories are mount points. If multiple filesystems are mounted in the shared directory, virtiofsd passes inode IDs directly to the guest, and because such IDs are unique only on a single filesystem, it is possible that the guest will encounter duplicates if multiple filesystems are mounted in the shared directory. `--announce-submounts` solves that problem because it reports a different device number for every submount it encounters. In addition, when running with `--announce-submounts`, the client sends one `SYNCFS` request per submount that is to be synced, so virtiofsd will call `syncfs()` on each submount. 
On the other hand, when running without `--announce-submounts`, the client only sends a `SYNCFS` request for the root mount, this may lead to data loss/corruption. ```shell --no-killpriv-v2 ``` Disable `KILLPRIV V2` support. This is required if the shared directory is an NFS file system. `KILLPRIV V2` support is disabled by default. ```shell --killpriv-v2 ``` Enable `KILLPRIV V2` support. It is disabled by default. ```shell --no-readdirplus ``` Disable support for `READDIRPLUS` operations. ```shell --writeback ``` Enable writeback cache. ```shell --xattr ``` Enable support for extended attributes. ```shell --posix-acl ``` Enable support for posix ACLs (implies --xattr). ```shell --security-label ``` Enable support for security label (SELinux). ```shell --preserve-noatime ``` Always preserve `O_NOATIME`. By default virtiofsd will implicitly clean up `O_NOATIME` to prevent potential permission errors when it does not have the right capabilities to access all the exported files (typically when running as unprivileged user and with `--sandbox none`, that means it won't have the `CAP_FOWNER` capability set). The option `--preserve-noatime` can be used to override this behavior and preserve the `O_NOATIME` flag specified by the client. #### Options ```shell --shared-dir ``` Shared directory path. ```shell --tag ``` The tag that the virtio device advertises. Setting this option will enable advertising of VHOST_USER_PROTOCOL_F_CONFIG. However, the vhost-user frontend of your hypervisor may not negotiate this feature and (or) ignore this value. Notably, QEMU currently (as of 8.1) ignores the CONFIG feature. QEMU versions from 7.1 to 8.0 will crash while attempting to log a warning about not supporting the feature. ```shell --socket-group ``` Name of group for the vhost-user socket. ```shell --socket-path ``` vhost-user socket path. ```shell --fd ``` File descriptor for the listening socket. ```shell --log-level ``` Log level (error, warn, info, debug, trace, off). 
Default: info. ```shell --thread-pool-size ``` Maximum thread pool size. A value of "0" disables the pool. Default: 0. ```shell --rlimit-nofile ``` Set maximum number of file descriptors. If the soft limit is greater than 1M or `--rlimit-nofile=0` is passed as parameter, the maximum number of file descriptors is not changed. Default: min(1000000, `/proc/sys/fs/nr_open`). ```shell --modcaps= ``` Modify the list of capabilities, e.g., `--modcaps=+sys_admin:-chown`. Although it is not mandatory, it is recommended to always use the `=` sign, in other case, this will fail `--modcaps -mknod`, because it will be interpreted as two options, instead of the intended `--modcaps=-mknod`. ```shell --sandbox ``` Sandbox mechanism to isolate the daemon process (namespace, chroot, none). - **namespace**: The program switches into a new file system namespace (`namespaces(7)`) and invokes `pivot_root(2)` to make the shared directory tree its root. A new mount (`mount_namespaces(7)`), pid (`pid_namespaces(7)`) and net namespace (`network_namespaces(7)`) is also created to isolate the process. - **chroot**: The program invokes `chroot(2)` to make the shared directory tree its root. This mode is intended for container environments where the container runtime has already set up the namespaces and the program does not have permission to create namespaces itself. - **none**: Do not isolate the daemon (not recommended). Both **namespace** and **chroot** sandbox modes prevent "file system escapes" due to symlinks and other file system objects that might lead to files outside the shared directory. Default: namespace. ```shell --seccomp ``` Action to take when seccomp finds a not allowed syscall (none, kill, log, trap). Default: kill. ```shell --cache ``` The caching policy the file system should use (auto, always, metadata, never). Default: auto. ```shell --inode-file-handles= ``` When to use file handles to reference inodes instead of `O_PATH` file descriptors (never, prefer, mandatory). 
- **never**: Never use file handles, always use `O_PATH` file descriptors. - **prefer**: Attempt to generate file handles, but fall back to `O_PATH` file descriptors where the underlying filesystem does not support file handles or `CAP_DAC_READ_SEARCH` is not available. Useful when there are various different filesystems under the shared directory and some of them do not support file handles. - **mandatory**: Always use file handles. It will fail if the underlying filesystem does not support file handles or `CAP_DAC_READ_SEARCH` is not available. Using file handles reduces the number of file descriptors virtiofsd keeps open, which is not only helpful with resources, but may also be important in cases where virtiofsd should only have file descriptors open for files that are open in the guest, e.g. to get around bad interactions with NFS's silly renaming (see [NFS FAQ, Section D2: "What is a "silly rename"?"](http://nfs.sourceforge.net/)). Default: never. ```shell --xattrmap ``` Add custom rules for translating extended attributes between host and guest (e.g., `:map::user.virtiofs.:`). For additional details please see [Extended attribute mapping](doc/xattr-mapping.md). ```shell --uid-map=:namespace_uid:host_uid:count: ``` When running virtiofsd as non-root, map a range of UIDs from host to namespace. In order to use this option, the range of subordinate user IDs must have been set up via `subuid(5)`. virtiofsd uses `newuidmap(1)`, that requires a valid subuid, to do the mapping. If this option is not provided, virtiofsd will set up a 1-to-1 mapping for current uid. namespace_uid: Beginning of the range of UIDs inside the user namespace. host_uid: Beginning of the range of UIDs outside the user namespace. count: Length of the ranges (both inside and outside the user namespace). For instance, let's assume the invoking UID is 1000 and the content of /etc/subuid is: 1000:100000:65536, which creates 65536 subuids starting at 100000, i.e. 
the (inclusive) range [100000, 165535], belonging to the actual UID 1000. This range can be mapped to the UIDs [0, 65535] in virtiofsd’s user namespace (i.e. as seen in the guest) via --uid-map=:0:100000:65536:. Alternatively, you can simply map your own UID to a single UID in the namespace: For example, --uid-map=:0:1000:1: would map UID 1000 to root’s UID in the namespace (and thus the guest). ```shell --gid-map=:namespace_gid:host_gid:count: ``` When running virtiofsd as non-root, map a range of GIDs from host to namespace. In order to use this option, the range of subordinate group IDs must have been set up via `subgid(5)`. virtiofsd uses `newgidmap(1)`, that requires a valid subgid, to do the mapping. If this option is not provided, virtiofsd will set up a 1-to-1 mapping for current gid. namespace_gid: Beginning of the range of GIDs inside the user namespace. host_gid: Beginning of the range of GIDs outside the user namespace. count: Length of the ranges (both inside and outside the user namespace). For instance, let's assume the invoking GID is 1000 and the content of /etc/subgid is: 1000:100000:65536, which creates 65536 subgids starting at 100000, i.e. the (inclusive) range [100000, 165535], belonging to the actual GID 1000. This range can be mapped to the GIDs [0, 65535] in virtiofsd’s user namespace (i.e. as seen in the guest) via --gid-map=:0:100000:65536:. Alternatively, you can simply map your own GID to a single GID in the namespace: For example, --gid-map=:0:1000:1: would map GID 1000 to root’s GID in the namespace (and thus the guest). 
### Examples Export `/mnt` on vhost-user UNIX domain socket `/tmp/vfsd.sock`: ```shell host# virtiofsd --socket-path=/tmp/vfsd.sock --shared-dir /mnt \ --announce-submounts --inode-file-handles=mandatory & host# qemu-system \ -blockdev file,node-name=hdd,filename= \ -device virtio-blk,drive=hdd \ -chardev socket,id=char0,path=/tmp/vfsd.sock \ -device vhost-user-fs-pci,queue-size=1024,chardev=char0,tag=myfs \ -object memory-backend-memfd,id=mem,size=4G,share=on \ -numa node,memdev=mem \ -accel kvm -m 4G guest# mount -t virtiofs myfs /mnt ``` See [FAQ](#faq) for adding virtiofs config to an existing qemu command-line. ### Running as non-privileged user When run without root, virtiofsd requires a user namespace (see `user_namespaces(7)`) to be able to switch between arbitrary user/group IDs within the guest. virtiofsd will fail in a user namespace where UIDs/GIDs have not been mapped (i.e., `uid_map` and `gid_map` files have not been written). There are many options to run virtiofsd inside a user namespace. For instance: Let's assume the invoking UID and GID is 1000 and the content of both `/etc/subuid` and `/etc/subgid` are: ``` 1000:100000:65536 ``` Using `podman-unshare(1)` the user namespace will be configured so that the invoking user's UID and primary GID (i.e., 1000) appear to be UID 0 and GID 0, respectively. 
Any ranges which match that user and group in `/etc/subuid` and `/etc/subgid` are also mapped in as themselves with the help of the `newuidmap(1)` and `newgidmap(1)` helpers: ```shell host$ podman unshare -- virtiofsd --socket-path=/tmp/vfsd.sock --shared-dir /mnt \ --announce-submounts --sandbox chroot & ``` Using `lxc-usernsexec(1)`, we could leave the invoking user outside the mapping, having the root user inside the user namespace mapped to the user and group 100000: ```shell host$ lxc-usernsexec -m b:0:100000:65536 -- virtiofsd --socket-path=/tmp/vfsd.sock \ --shared-dir /mnt --announce-submounts --sandbox chroot & ``` In order to have the same behavior as `podman-unshare(1)`, we need to run ```shell host$ lxc-usernsexec -m b:0:1000:1 -m b:1:100000:65536 -- virtiofsd --socket-path=/tmp/vfsd.sock \ --shared-dir /mnt --announce-submounts --sandbox chroot & ``` We could also select `--sandbox none` instead of `--sandbox chroot`. #### Limitations - Within the guest, it is not possible to create block or char device nodes in the shared directory. - virtiofsd can't use file handles (`--inode-file-handles` requires `CAP_DAC_READ_SEARCH`), so a large number of file descriptors is required. Additionally, on NFS, not using file handles may result in a hidden file lingering after some file is deleted (see [NFS FAQ, Section D2: "What is a "silly rename"?"](http://nfs.sourceforge.net/)). - virtiofsd will not be able to increase `RLIMIT_NOFILE`. ## FAQ - How to read-only-share a directory that cannot be modified within the guest? To accomplish this you need to export a read-only mount point, for instance, exporting `share`: ```shell mkdir ro-share mount -o bind,ro share ro-share virtiofsd --shared-dir ro-share ... ``` - How to share multiple directories with the same virtiofsd? 
Currently, virtiofsd only supports sharing a single directory, but it is possible to use submounts to achieve this, for instance, exporting `share0`, `share1`: ```shell mkdir -p share/{sh0,sh1} mount -o bind share0 share/sh0 mount -o bind share1 share/sh1 virtiofsd --announce-submounts --shared-dir share ... ``` Note the use of `--announce-submounts` to prevent data loss/corruption. - How to add virtiofs devices to an existing qemu command-line: If `-object memory-backend-memfd,id=mem` and either `-numa node,memdev=mem` or a `memory-backend=mem` property in the `-machine` option have not already been added to the command, add them. If a different memory backend is already configured then it should be changed to `memory-backend-memfd`. `-object memory-backend-memfd` **must** have the option `share=on` and `size=` **must** match the memory size defined by `-m`. For each virtiofs device mount add a `-chardev socket,id=${MATCHING_ID},path=${VIRTIOFSD_SOCKET_PATH}` and `-device vhost-user-fs-pci,queue-size=1024,chardev=${MATCHING_ID},tag=${VIRTIOFS_TAG}` substituting appropriate values for the shell-style variables. ## SELinux Support One can enable support for SELinux by running virtiofsd with option "--security-label". But this will try to save guest's security context in xattr security.selinux on host and it might fail if host's SELinux policy does not permit virtiofsd to do this operation. Hence, it is recommended to remap guest's "security.selinux" xattr to say "trusted.virtiofs.security.selinux" on host. Add following option to command line. "--xattrmap=:map:security.selinux:trusted.virtiofs.:" This will make sure that guest and host's SELinux xattrs on same file remain separate and not interfere with each other. And will allow both host and guest to implement their own separate SELinux policies. Setting trusted xattr on host requires CAP_SYS_ADMIN. So one will need add this capability to daemon. Add following option to command line. 
"--modcaps=+sys_admin" trusted xattrs are not namespaced. So virtiofsd needs to have CAP_SYS_ADMIN in init_user_ns. IOW, one should not be using user namespaces and virtiofsd should run with CAP_SYS_ADMIN. Giving CAP_SYS_ADMIN increases the risk on system. Now virtiofsd is more powerful and if gets compromised, it can do lot of damage to host system. So keep this trade-off in my mind while making a decision. virtiofsd-1.10.0/doc/xattr-mapping.md000064400000000000000000000172331046102023000156040ustar 00000000000000# Extended attribute (xattr) mapping By default, the name of xattrs used by the client are passed through to the server file system. This can be a problem where either those xattr names are used by something on the server (e.g. selinux client/server confusion) or if the virtiofsd is running in a container with restricted privileges where it cannot access some attributes. ## Mapping syntax A mapping of xattr names can be made using `--xattrmap=` where the `` string consists of a series of rules. When looking for a mapping, the first matching rule applies. There *must* be a mapping for every xattr name in the list of rules, for example by making the final rule a catch-all rule to match any remaining attributes. Each rule consists of a number of fields separated with a separator that is the first non-white space character in the rule. This separator must then be used for the whole rule. White space may be added before and after each rule. Using `:` as the separator a rule is of the form: ``` :type:scope:key:prepend: ``` **scope** is one of: - `client`: Match **key** against an xattr name from the client for setxattr/getxattr/removexattr - `server`: Match **prepend** against an xattr name from the server for listxattr - `all`: Can be used to make a single rule where both the server and client matches are triggered. **type** is one of: - `prefix`: Is designed to prepend and strip a prefix; the modified attributes then being passed on to the client/server. 
- `ok`: Causes the rule set to be terminated when a match is found while allowing matching xattrs through unchanged. It is intended both as a way of explicitly terminating the list of rules, and to allow some xattrs to skip following rules. - `bad`: If a client tries to use a name matching **key** it's denied using `EPERM`; when the server passes an attribute name matching **prepend** it's hidden. In many ways its use is very like the `ok` type as either an explicit terminator or for special handling of certain patterns. - `unsupported`: If a client tries to use a name matching **key** it's denied using `ENOTSUP`; when the server passes an attribute name matching **prepend** it's hidden. In many ways its use is very like the `ok` type as either an explicit terminator or for special handling of certain patterns. **key** is a string tested as a prefix on an attribute name originating on the client. It may be empty in which case a `client` scoped rule will always match on client names. **prepend** is a string tested as a prefix on an attribute name originating on the server, and used as a new prefix. It may be empty in which case a `server` scoped rule will always match on all names from the server. e.g.: | Mapping rule | Description | | ----------------------------------------- | ---------------------------------------------------------------------------------------------------- | | `:prefix:client:trusted.:user.virtiofs.:` | will match `trusted.*` attributes in client calls and prefix them before passing them to the server. | | `:prefix:server::user.virtiofs.:` | will strip `user.virtiofs.` from all server replies. | | `:prefix:all:trusted.:user.virtiofs.:` | combines the previous two cases into a single rule. | | `:ok:client:user.::` | will allow get/set xattr for `user.` xattrs. | | `:ok:server::security.:` | will pass `security.` xattrs in listxattr from the server. 
| | `:ok:all:::` | will terminate the rule search passing any remaining attributes in both directions. | | `:bad:server::security.:` | would hide `security.` xattrs in listxattr from the server. | A simpler **map** type provides a shorter syntax for the common case: ``` :map:key:prepend: ``` The `map` type adds a number of separate rules to add **prepend** as a prefix to the matched **key** (or all attributes if **key** is empty). There may be at most one `map` rule, and it must be the last rule in the set. Please note that when the `security.capability` xattr is remapped, the daemon has to do extra work to remove it during many operations, which the host kernel normally does itself. ## Security considerations Operating systems typically partition the xattr namespace using well-defined name prefixes. Each partition may have different access controls applied. For example, on Linux there are multiple partitions - `system.*`: access varies depending on attribute and filesystem - `security.*`: only processes with `CAP_SYS_ADMIN` - `trusted.*`: only processes with `CAP_SYS_ADMIN` - `user.*`: any process granted by file permissions / ownership While other OS such as FreeBSD have different name prefixes and access control rules. When remapping attributes on the host, it is important to ensure that the remapping does not allow a guest user to evade the guest access control rules. Consider if `trusted.*` from the guest was remapped to `user.virtiofs.trusted.*` in the host. An unprivileged user in a Linux guest has the ability to write to xattrs under `user.*`. Thus the user can evade the access control restriction on `trusted.*` by instead writing to `user.virtiofs.trusted.*`. As noted above, the partitions used and access controls applied, will vary across guest OS, so it is not wise to try to predict what the guest OS will use. The simplest way to avoid an insecure configuration is to remap all xattrs at once, to a given fixed prefix. This is shown in example (1) below. 
If selectively mapping only a subset of xattr prefixes, then rules must be added to explicitly block direct access to the target of the remapping. This is shown in example (2) below. ## Mapping examples 1. Prefix all attributes with `user.virtiofs.` ```shell --xattrmap=":prefix:all::user.virtiofs.::bad:all:::" ``` This uses two rules, using : as the field separator; the first rule prefixes and strips `user.virtiofs.`, the second rule hides any non-prefixed attributes that the host set. This is equivalent to the `map` rule: ```shell --xattrmap=":map::user.virtiofs.:" ``` 2. Prefix `trusted.` attributes, allow others through ```shell --xattrmap="/prefix/all/trusted./user.virtiofs./ /bad/server//trusted./ /bad/client/user.virtiofs.// /ok/all///" ``` (each rule is on a single line just for the sake of clarity) Here there are four rules, using `/` as the field separator, and also demonstrating that new lines can be included between rules. The first rule is the prefixing of `trusted.` and stripping of `user.virtiofs.`. The second rule hides unprefixed `trusted.` attributes on the host. The third rule stops a guest from explicitly setting the `user.virtiofs.` path directly to prevent access control bypass on the target of the earlier prefix remapping. Finally, the fourth rule lets all remaining attributes through. This is equivalent to the `map` rule: ```shell --xattrmap="/map/trusted./user.virtiofs./" ``` 3. Hide `security.` attributes, and allow everything else ```shell --xattrmap="/bad/all/security./security./ /ok/all///" ``` The first rule combines what could be separate client and server rules into a single `all` rule, matching `security.` in either client arguments or lists returned from the host. 
This prevents the client from seeing and/or setting any `security.` attributes on the server.virtiofsd-1.10.0/rustfmt.toml000064400000000000000000000000601046102023000143110ustar 00000000000000imports_granularity = "Module" edition = "2018" virtiofsd-1.10.0/src/descriptor_utils.rs000064400000000000000000001003201046102023000164430ustar 00000000000000// Copyright 2019 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use std::collections::VecDeque; use std::fmt::{self, Display}; use std::io::{self, Read, Write}; use std::mem::{size_of, MaybeUninit}; use std::ops::Deref; use std::ptr::copy_nonoverlapping; use std::{cmp, result}; use virtio_queue::DescriptorChain; use vm_memory::bitmap::{Bitmap, BitmapSlice}; use vm_memory::{ Address, ByteValued, GuestMemory, GuestMemoryError, GuestMemoryMmap, GuestMemoryRegion, Le16, Le32, Le64, VolatileMemory, VolatileMemoryError, VolatileSlice, }; use crate::file_traits::FileReadWriteAtVolatile; use crate::oslib; #[derive(Debug)] pub enum Error { DescriptorChainOverflow, FindMemoryRegion, GuestMemoryError(GuestMemoryError), InvalidChain, IoError(io::Error), SplitOutOfBounds(usize), VolatileMemoryError(VolatileMemoryError), } impl Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::Error::*; match self { DescriptorChainOverflow => write!( f, "the combined length of all the buffers in a `DescriptorChain` would overflow" ), FindMemoryRegion => write!(f, "no memory region for this address range"), GuestMemoryError(e) => write!(f, "descriptor guest memory error: {e}"), InvalidChain => write!(f, "invalid descriptor chain"), IoError(e) => write!(f, "descriptor I/O error: {e}"), SplitOutOfBounds(off) => write!(f, "`DescriptorChain` split is out of bounds: {off}"), VolatileMemoryError(e) => write!(f, "volatile memory error: {e}"), } } } pub type Result = result::Result; impl std::error::Error for Error {} 
// NOTE(review): this file was recovered from a dump in which all angle-bracketed
// tokens (`<...>`) were stripped; the generic parameters below are reconstructed
// from the surviving identifiers and bounds. TODO: diff against upstream
// virtiofsd src/descriptor_utils.rs to confirm the exact spellings.

/// Tracks the unconsumed portion of a descriptor chain: a queue of volatile
/// slices into guest memory plus a running count of bytes already consumed.
#[derive(Clone)]
struct DescriptorChainConsumer<'a, B> {
    buffers: VecDeque<VolatileSlice<'a, B>>,
    bytes_consumed: usize,
}

impl<'a, B: BitmapSlice> DescriptorChainConsumer<'a, B> {
    /// Total number of bytes remaining in the chain.
    fn available_bytes(&self) -> usize {
        // This is guaranteed not to overflow because the total length of the chain
        // is checked during all creations of `DescriptorChainConsumer` (see
        // `Reader::new()` and `Writer::new()`).
        self.buffers
            .iter()
            .fold(0usize, |count, vs| count + vs.len())
    }

    /// Number of bytes consumed from the chain so far.
    fn bytes_consumed(&self) -> usize {
        self.bytes_consumed
    }

    /// Consumes at most `count` bytes from the `DescriptorChain`. Callers must provide a function
    /// that takes a `&[VolatileSlice]` and returns the total number of bytes consumed. This
    /// function guarantees that the combined length of all the slices in the `&[VolatileSlice]` is
    /// less than or equal to `count`.
    ///
    /// # Errors
    ///
    /// If the provided function returns any error then no bytes are consumed from the buffer and
    /// the error is returned to the caller.
    fn consume<F>(&mut self, count: usize, f: F) -> io::Result<usize>
    where
        F: FnOnce(&[&VolatileSlice<B>]) -> io::Result<usize>,
    {
        // Collect just enough slices to cover `count` bytes.
        let mut buflen = 0;
        let mut bufs = Vec::with_capacity(self.buffers.len());
        for vs in &self.buffers {
            if buflen >= count {
                break;
            }

            bufs.push(vs);

            let rem = count - buflen;
            if rem < vs.len() {
                buflen += rem;
            } else {
                buflen += vs.len();
            }
        }

        if bufs.is_empty() {
            return Ok(0);
        }

        let bytes_consumed = f(&bufs)?;

        // This can happen if a driver tricks a device into reading/writing more data than
        // fits in a `usize`.
        let total_bytes_consumed =
            self.bytes_consumed
                .checked_add(bytes_consumed)
                .ok_or_else(|| {
                    io::Error::new(io::ErrorKind::InvalidData, Error::DescriptorChainOverflow)
                })?;

        // Advance the queue past the consumed bytes, splitting the slice that
        // straddles the new position.
        let mut rem = bytes_consumed;
        while let Some(vs) = self.buffers.pop_front() {
            if rem < vs.len() {
                // Split the slice and push the remainder back into the buffer list. Safe because we
                // know that `rem` is not out of bounds due to the check and we checked the bounds
                // on `vs` when we added it to the buffer list.
                self.buffers.push_front(vs.offset(rem).unwrap());
                break;
            }

            // No need for checked math because we know that `vs.len() <= rem`.
            rem -= vs.len();
        }

        self.bytes_consumed = total_bytes_consumed;

        Ok(bytes_consumed)
    }

    /// Splits the consumer at `offset` bytes, leaving the first `offset` bytes in
    /// `self` and returning a new consumer over the remainder. Errors if `offset`
    /// is beyond the end of the chain.
    fn split_at(&mut self, offset: usize) -> Result<DescriptorChainConsumer<'a, B>> {
        let mut rem = offset;
        let pos = self.buffers.iter().position(|vs| {
            if rem < vs.len() {
                true
            } else {
                rem -= vs.len();
                false
            }
        });

        if let Some(at) = pos {
            let mut other = self.buffers.split_off(at);

            if rem > 0 {
                // There must be at least one element in `other` because we checked
                // its `size` value in the call to `position` above.
                let front = other.pop_front().expect("empty VecDeque after split");
                self.buffers
                    .push_back(front.subslice(0, rem).map_err(Error::VolatileMemoryError)?);
                other.push_front(front.offset(rem).map_err(Error::VolatileMemoryError)?);
            }

            Ok(DescriptorChainConsumer {
                buffers: other,
                bytes_consumed: 0,
            })
        } else if rem == 0 {
            // `offset` is exactly the end of the chain: the second half is empty.
            Ok(DescriptorChainConsumer {
                buffers: VecDeque::new(),
                bytes_consumed: 0,
            })
        } else {
            Err(Error::SplitOutOfBounds(offset))
        }
    }
}

/// Provides high-level interface over the sequence of memory regions
/// defined by readable descriptors in the descriptor chain.
///
/// Note that virtio spec requires driver to place any device-writable
/// descriptors after any device-readable descriptors (2.6.4.2 in Virtio Spec v1.1).
/// Reader will skip iterating over descriptor chain when first writable
/// descriptor is encountered.
#[derive(Clone)]
pub struct Reader<'a, B = ()> {
    buffer: DescriptorChainConsumer<'a, B>,
}

impl<'a, B: Bitmap + BitmapSlice + 'static> Reader<'a, B> {
    /// Construct a new Reader wrapper over `desc_chain`.
    pub fn new<M>(
        mem: &'a GuestMemoryMmap<B>,
        desc_chain: DescriptorChain<M>,
    ) -> Result<Reader<'a, B>>
    where
        M: Deref,
        M::Target: GuestMemory + Sized,
    {
        let mut total_len: usize = 0;
        let buffers = desc_chain
            .readable()
            .map(|desc| {
                // Verify that summing the descriptor sizes does not overflow.
                // This can happen if a driver tricks a device into reading more data than
                // fits in a `usize`.
                total_len = total_len
                    .checked_add(desc.len() as usize)
                    .ok_or(Error::DescriptorChainOverflow)?;

                let region = mem
                    .find_region(desc.addr())
                    .ok_or(Error::FindMemoryRegion)?;
                // Cannot underflow: `find_region` guarantees the descriptor
                // address is inside `region`.
                let offset = desc
                    .addr()
                    .checked_sub(region.start_addr().raw_value())
                    .unwrap();
                region
                    .deref()
                    .get_slice(offset.raw_value() as usize, desc.len() as usize)
                    .map_err(Error::VolatileMemoryError)
            })
            .collect::<Result<VecDeque<VolatileSlice<'a, B>>>>()?;

        Ok(Reader {
            buffer: DescriptorChainConsumer {
                buffers,
                bytes_consumed: 0,
            },
        })
    }

    /// Reads an object from the descriptor chain buffer.
    pub fn read_obj<T: ByteValued>(&mut self) -> io::Result<T> {
        let mut obj = MaybeUninit::<T>::uninit();

        // Safe because `MaybeUninit` guarantees that the pointer is valid for
        // `size_of::<T>()` bytes.
        let buf = unsafe {
            ::std::slice::from_raw_parts_mut(obj.as_mut_ptr() as *mut u8, size_of::<T>())
        };

        self.read_exact(buf)?;

        // Safe because any type that implements `ByteValued` can be considered initialized
        // even if it is filled with random data.
        Ok(unsafe { obj.assume_init() })
    }

    /// Reads data from the descriptor chain buffer into a File at offset `off`.
    /// Returns the number of bytes read from the descriptor chain buffer.
    /// The number of bytes read can be less than `count` if there isn't
    /// enough data in the descriptor chain buffer.
    pub fn read_to_at<F: FileReadWriteAtVolatile<B>>(
        &mut self,
        dst: F,
        count: usize,
        off: u64,
        flags: Option<oslib::WritevFlags>,
    ) -> io::Result<usize> {
        self.buffer.consume(count, |bufs| {
            dst.write_vectored_at_volatile(bufs, off, flags)
        })
    }

    /// Returns number of bytes available for reading.
    pub fn available_bytes(&self) -> usize {
        self.buffer.available_bytes()
    }

    /// Returns number of bytes already read from the descriptor chain buffer.
pub fn bytes_read(&self) -> usize { self.buffer.bytes_consumed() } /// Splits this `Reader` into two at the given offset in the `DescriptorChain` buffer. /// After the split, `self` will be able to read up to `offset` bytes while the returned /// `Reader` can read up to `available_bytes() - offset` bytes. Returns an error if /// `offset > self.available_bytes()`. pub fn split_at(&mut self, offset: usize) -> Result> { self.buffer.split_at(offset).map(|buffer| Reader { buffer }) } } impl<'a, B: BitmapSlice> io::Read for Reader<'a, B> { fn read(&mut self, buf: &mut [u8]) -> io::Result { self.buffer.consume(buf.len(), |bufs| { let mut rem = buf; let mut total = 0; for vs in bufs { let copy_len = cmp::min(rem.len(), vs.len()); // SAFETY: Safe because we verify that we do not read outside // of the slice's bound. The slice guard will only get dropped // after the function returns. This will keep the pointer valid // while reads are happening. unsafe { copy_nonoverlapping(vs.ptr_guard().as_ptr(), rem.as_mut_ptr(), copy_len); } rem = &mut rem[copy_len..]; total += copy_len; } Ok(total) }) } } /// Provides high-level interface over the sequence of memory regions /// defined by writable descriptors in the descriptor chain. /// /// Note that virtio spec requires driver to place any device-writable /// descriptors after any device-readable descriptors (2.6.4.2 in Virtio Spec v1.1). /// Writer will start iterating the descriptors from the first writable one and will /// assume that all following descriptors are writable. #[derive(Clone)] pub struct Writer<'a, B = ()> { buffer: DescriptorChainConsumer<'a, B>, } impl<'a, B: Bitmap + BitmapSlice + 'static> Writer<'a, B> { /// Construct a new Writer wrapper over `desc_chain`. 
pub fn new( mem: &'a GuestMemoryMmap, desc_chain: DescriptorChain, ) -> Result> where M: Deref, M::Target: GuestMemory + Sized, { let mut total_len: usize = 0; let buffers = desc_chain .writable() .map(|desc| { // Verify that summing the descriptor sizes does not overflow. // This can happen if a driver tricks a device into writing more data than // fits in a `usize`. total_len = total_len .checked_add(desc.len() as usize) .ok_or(Error::DescriptorChainOverflow)?; let region = mem .find_region(desc.addr()) .ok_or(Error::FindMemoryRegion)?; let offset = desc .addr() .checked_sub(region.start_addr().raw_value()) .unwrap(); region .deref() .get_slice(offset.raw_value() as usize, desc.len() as usize) .map_err(Error::VolatileMemoryError) }) .collect::>>>()?; Ok(Writer { buffer: DescriptorChainConsumer { buffers, bytes_consumed: 0, }, }) } /// Writes an object to the descriptor chain buffer. pub fn write_obj(&mut self, val: T) -> io::Result<()> { self.write_all(val.as_slice()) } /// Returns number of bytes available for writing. May return an error if the combined /// lengths of all the buffers in the DescriptorChain would cause an overflow. pub fn available_bytes(&self) -> usize { self.buffer.available_bytes() } /// Writes data to the descriptor chain buffer from a File at offset `off`. /// Returns the number of bytes written to the descriptor chain buffer. /// The number of bytes written can be less than `count` if /// there isn't enough data in the descriptor chain buffer. pub fn write_from_at>( &mut self, src: F, count: usize, off: u64, ) -> io::Result { self.buffer .consume(count, |bufs| src.read_vectored_at_volatile(bufs, off)) } /// Returns number of bytes already written to the descriptor chain buffer. pub fn bytes_written(&self) -> usize { self.buffer.bytes_consumed() } /// Splits this `Writer` into two at the given offset in the `DescriptorChain` buffer. 
/// After the split, `self` will be able to write up to `offset` bytes while the returned /// `Writer` can write up to `available_bytes() - offset` bytes. Returns an error if /// `offset > self.available_bytes()`. pub fn split_at(&mut self, offset: usize) -> Result> { self.buffer.split_at(offset).map(|buffer| Writer { buffer }) } } impl<'a, B: BitmapSlice> io::Write for Writer<'a, B> { fn write(&mut self, buf: &[u8]) -> io::Result { self.buffer.consume(buf.len(), |bufs| { let mut rem = buf; let mut total = 0; for vs in bufs { let copy_len = cmp::min(rem.len(), vs.len()); // SAFETY: Safe because we ensure that we do not write over the // slice's bounds. The slice guard will only get dropped after // the function returns. This will keep the pointer valid while // writes are happening. unsafe { copy_nonoverlapping(rem.as_ptr(), vs.ptr_guard_mut().as_ptr(), copy_len); } vs.bitmap().mark_dirty(0, copy_len); rem = &rem[copy_len..]; total += copy_len; } Ok(total) }) } fn flush(&mut self) -> io::Result<()> { // Nothing to flush since the writes go straight into the buffer. Ok(()) } } #[derive(Copy, Clone, PartialEq, Eq)] pub enum DescriptorType { Readable, Writable, } #[derive(Copy, Clone, Debug, Default)] #[repr(C)] struct virtq_desc { addr: Le64, len: Le32, flags: Le16, next: Le16, } // Safe because it only has data and has no implicit padding. unsafe impl ByteValued for virtq_desc {} #[derive(Copy, Clone, Debug, Default)] #[repr(C)] struct virtq_avail { flags: Le16, idx: Le16, ring: Le16, } // Safe because it only has data and has no implicit padding. unsafe impl ByteValued for virtq_avail {} #[cfg(test)] mod tests { use super::*; use virtio_queue::{Queue, QueueOwnedT, QueueT}; use vm_memory::{Bytes, GuestAddress}; const VIRTQ_DESC_F_NEXT: u16 = 0x1; const VIRTQ_DESC_F_WRITE: u16 = 0x2; const MAX_QUEUE_SIZE: u16 = 32768; /// Test utility function to create a descriptor chain in guest memory. 
pub fn create_descriptor_chain( memory: &GuestMemoryMmap, descriptor_array_addr: GuestAddress, mut buffers_start_addr: GuestAddress, descriptors: Vec<(DescriptorType, u32)>, spaces_between_regions: u32, ) -> Result> { let descriptors_len = descriptors.len(); for (index, (type_, size)) in descriptors.into_iter().enumerate() { let mut flags = 0; if let DescriptorType::Writable = type_ { flags |= VIRTQ_DESC_F_WRITE; } if index + 1 < descriptors_len { flags |= VIRTQ_DESC_F_NEXT; } let index = index as u16; let desc = virtq_desc { addr: buffers_start_addr.raw_value().into(), len: size.into(), flags: flags.into(), next: (index + 1).into(), }; let offset = size + spaces_between_regions; buffers_start_addr = buffers_start_addr .checked_add(u64::from(offset)) .ok_or(Error::InvalidChain)?; let _ = memory.write_obj( desc, descriptor_array_addr .checked_add(u64::from(index) * std::mem::size_of::() as u64) .ok_or(Error::InvalidChain)?, ); } let avail_ring = descriptor_array_addr .checked_add( u64::from(descriptors_len as u16) * std::mem::size_of::() as u64, ) .ok_or(Error::InvalidChain)?; let avail = virtq_avail { flags: 0.into(), idx: 1.into(), ring: 0.into(), }; let _ = memory.write_obj(avail, avail_ring); let mut queue: Queue = Queue::new(MAX_QUEUE_SIZE).unwrap(); queue .try_set_desc_table_address(descriptor_array_addr) .unwrap(); queue.try_set_avail_ring_address(avail_ring).unwrap(); queue.set_ready(true); let desc = queue.iter(memory).unwrap().next().unwrap(); Ok(desc.clone()) } #[test] fn reader_test_simple_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 8), (Readable, 16), (Readable, 18), (Readable, 64), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); 
assert_eq!(reader.available_bytes(), 106); assert_eq!(reader.bytes_read(), 0); let mut buffer = [0 as u8; 64]; if let Err(e) = reader.read_exact(&mut buffer) { panic!("read_exact should not fail here: {:?}", e); } assert_eq!(reader.available_bytes(), 42); assert_eq!(reader.bytes_read(), 64); match reader.read(&mut buffer) { Err(e) => panic!("read should not fail here: {:?}", e), Ok(length) => assert_eq!(length, 42), } assert_eq!(reader.available_bytes(), 0); assert_eq!(reader.bytes_read(), 106); } #[test] fn writer_test_simple_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Writable, 8), (Writable, 16), (Writable, 18), (Writable, 64), ], 0, ) .expect("create_descriptor_chain failed"); let mut writer = Writer::new(&memory, chain).expect("failed to create Writer"); assert_eq!(writer.available_bytes(), 106); assert_eq!(writer.bytes_written(), 0); let buffer = [0 as u8; 64]; if let Err(e) = writer.write_all(&buffer) { panic!("write_all should not fail here: {:?}", e); } assert_eq!(writer.available_bytes(), 42); assert_eq!(writer.bytes_written(), 64); match writer.write(&buffer) { Err(e) => panic!("write should not fail here {:?}", e), Ok(length) => assert_eq!(length, 42), } assert_eq!(writer.available_bytes(), 0); assert_eq!(writer.bytes_written(), 106); } #[test] fn reader_test_incompatible_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Writable, 8)], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); assert_eq!(reader.available_bytes(), 0); assert_eq!(reader.bytes_read(), 0); 
assert!(reader.read_obj::().is_err()); assert_eq!(reader.available_bytes(), 0); assert_eq!(reader.bytes_read(), 0); } #[test] fn writer_test_incompatible_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Readable, 8)], 0, ) .expect("create_descriptor_chain failed"); let mut writer = Writer::new(&memory, chain).expect("failed to create Writer"); assert_eq!(writer.available_bytes(), 0); assert_eq!(writer.bytes_written(), 0); assert!(writer.write_obj(0u8).is_err()); assert_eq!(writer.available_bytes(), 0); assert_eq!(writer.bytes_written(), 0); } #[test] fn reader_writer_shared_chain() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain.clone()).expect("failed to create Reader"); let mut writer = Writer::new(&memory, chain).expect("failed to create Writer"); assert_eq!(reader.bytes_read(), 0); assert_eq!(writer.bytes_written(), 0); let mut buffer = Vec::with_capacity(200); assert_eq!( reader .read_to_end(&mut buffer) .expect("read should not fail here"), 128 ); // The writable descriptors are only 68 bytes long. 
writer .write_all(&buffer[..68]) .expect("write should not fail here"); assert_eq!(reader.available_bytes(), 0); assert_eq!(reader.bytes_read(), 128); assert_eq!(writer.available_bytes(), 0); assert_eq!(writer.bytes_written(), 68); } #[test] fn reader_writer_shattered_object() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let secret: Le32 = 0x1234_5678.into(); // Create a descriptor chain with memory regions that are properly separated. let chain_writer = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Writable, 1), (Writable, 1), (Writable, 1), (Writable, 1)], 123, ) .expect("create_descriptor_chain failed"); let mut writer = Writer::new(&memory, chain_writer).expect("failed to create Writer"); if let Err(e) = writer.write_obj(secret) { panic!("write_obj should not fail here: {:?}", e); } // Now create new descriptor chain pointing to the same memory and try to read it. 
let chain_reader = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Readable, 1), (Readable, 1), (Readable, 1), (Readable, 1)], 123, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain_reader).expect("failed to create Reader"); match reader.read_obj::() { Err(e) => panic!("read_obj should not fail here: {:?}", e), Ok(read_secret) => assert_eq!(read_secret, secret), } } #[test] fn reader_unexpected_eof() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Readable, 256), (Readable, 256)], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let mut buf = vec![0; 1024]; assert_eq!( reader .read_exact(&mut buf[..]) .expect_err("read more bytes than available") .kind(), io::ErrorKind::UnexpectedEof ); } #[test] fn split_border() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let other = reader.split_at(32).expect("failed to split Reader"); assert_eq!(reader.available_bytes(), 32); assert_eq!(other.available_bytes(), 96); } #[test] fn split_middle() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), 
(Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let other = reader.split_at(24).expect("failed to split Reader"); assert_eq!(reader.available_bytes(), 24); assert_eq!(other.available_bytes(), 104); } #[test] fn split_end() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let other = reader.split_at(128).expect("failed to split Reader"); assert_eq!(reader.available_bytes(), 128); assert_eq!(other.available_bytes(), 0); } #[test] fn split_beginning() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), (Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let other = reader.split_at(0).expect("failed to split Reader"); assert_eq!(reader.available_bytes(), 0); assert_eq!(other.available_bytes(), 128); } #[test] fn split_outofbounds() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![ (Readable, 16), (Readable, 16), (Readable, 96), (Writable, 64), (Writable, 1), 
(Writable, 3), ], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); if reader.split_at(256).is_ok() { panic!("successfully split Reader with out of bounds offset"); } } #[test] fn read_full() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Readable, 16), (Readable, 16), (Readable, 16)], 0, ) .expect("create_descriptor_chain failed"); let mut reader = Reader::new(&memory, chain).expect("failed to create Reader"); let mut buf = vec![0u8; 64]; assert_eq!( reader.read(&mut buf[..]).expect("failed to read to buffer"), 48 ); } #[test] fn write_full() { use DescriptorType::*; let memory_start_addr = GuestAddress(0x0); let memory = GuestMemoryMmap::from_ranges(&[(memory_start_addr, 0x10000)]).unwrap(); let chain = create_descriptor_chain( &memory, GuestAddress(0x0), GuestAddress(0x100), vec![(Writable, 16), (Writable, 16), (Writable, 16)], 0, ) .expect("create_descriptor_chain failed"); let mut writer = Writer::new(&memory, chain).expect("failed to create Writer"); let buf = vec![0xdeu8; 64]; assert_eq!( writer.write(&buf[..]).expect("failed to write from buffer"), 48 ); } } virtiofsd-1.10.0/src/file_traits.rs000064400000000000000000000131271046102023000153620ustar 00000000000000// Copyright 2018 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use std::convert::TryInto; use std::fs::File; use std::io::{Error, Result}; use std::os::unix::io::{AsFd, AsRawFd}; use vm_memory::VolatileSlice; use crate::oslib; use libc::{c_int, c_void, off64_t, preadv64, size_t}; use vm_memory::bitmap::BitmapSlice; /// A trait for setting the size of a file. 
/// This is equivalent to File's `set_len` method, but /// wrapped in a trait so that it can be implemented for /// other types. pub trait FileSetLen { // Set the size of this file. // This is the moral equivalent of `ftruncate()`. fn set_len(&self, _len: u64) -> Result<()>; } impl FileSetLen for File { fn set_len(&self, len: u64) -> Result<()> { File::set_len(self, len) } } /// A trait similar to the unix `ReadExt` and `WriteExt` traits, but for volatile memory. pub trait FileReadWriteAtVolatile { /// Reads bytes from this file at `offset` into the given slice of buffers, returning the number /// of bytes read on success. Data is copied to fill each buffer in order, with the final buffer /// written to possibly being only partially filled. fn read_vectored_at_volatile(&self, bufs: &[&VolatileSlice], offset: u64) -> Result; /// Writes bytes to this file at `offset` from the given slice of buffers, returning the number /// of bytes written on success. Data is copied from each buffer in order, with the final buffer /// read from possibly being only partially consumed. fn write_vectored_at_volatile( &self, bufs: &[&VolatileSlice], offset: u64, flags: Option, ) -> Result; } impl<'a, B: BitmapSlice, T: FileReadWriteAtVolatile + ?Sized> FileReadWriteAtVolatile for &'a T { fn read_vectored_at_volatile(&self, bufs: &[&VolatileSlice], offset: u64) -> Result { (**self).read_vectored_at_volatile(bufs, offset) } fn write_vectored_at_volatile( &self, bufs: &[&VolatileSlice], offset: u64, flags: Option, ) -> Result { (**self).write_vectored_at_volatile(bufs, offset, flags) } } macro_rules! 
volatile_impl { ($ty:ty) => { impl FileReadWriteAtVolatile for $ty { fn read_vectored_at_volatile( &self, bufs: &[&VolatileSlice], offset: u64, ) -> Result { let slice_guards: Vec<_> = bufs.iter().map(|s| s.ptr_guard_mut()).collect(); let iovecs: Vec = slice_guards .iter() .map(|s| libc::iovec { iov_base: s.as_ptr() as *mut c_void, iov_len: s.len() as size_t, }) .collect(); if iovecs.is_empty() { return Ok(0); } // SAFETY: Safe because only bytes inside the buffers are // accessed and the kernel is expected to handle arbitrary // memory for I/O. The pointers into the slice are valid since // the slice_guards are still in scope. let ret = unsafe { preadv64( self.as_raw_fd(), &iovecs[0], iovecs.len() as c_int, offset as off64_t, ) }; if ret >= 0 { let mut total = 0; for vs in bufs { // Each `VolatileSlice` has a "local" bitmap (i.e., the offset 0 in the // bitmap corresponds to the beginning of the `VolatileSlice`) vs.bitmap() .mark_dirty(0, std::cmp::min(ret as usize - total, vs.len())); total += vs.len(); if total >= ret as usize { break; } } Ok(ret as usize) } else { Err(Error::last_os_error()) } } fn write_vectored_at_volatile( &self, bufs: &[&VolatileSlice], offset: u64, flags: Option, ) -> Result { let slice_guards: Vec<_> = bufs.iter().map(|s| s.ptr_guard()).collect(); let iovecs: Vec = slice_guards .iter() .map(|s| libc::iovec { iov_base: s.as_ptr() as *mut c_void, iov_len: s.len() as size_t, }) .collect(); if iovecs.is_empty() { return Ok(0); } // SAFETY: Each `libc::iovec` element is created from a // `VolatileSlice` of the guest memory. The pointers are valid // because the slice guards are still in scope. We also ensure // that we do not read over the slice bounds. unsafe { oslib::writev_at( self.as_fd(), iovecs.as_slice(), offset.try_into().unwrap(), flags, ) } } } }; } volatile_impl!(File); virtiofsd-1.10.0/src/filesystem.rs000064400000000000000000001417001046102023000152400ustar 00000000000000// Copyright 2019 The Chromium OS Authors. 
All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use std::convert::TryInto; use std::ffi::{CStr, CString}; use std::fs::File; use std::time::Duration; use std::{io, mem}; use crate::{fuse, oslib}; use super::fs_cache_req_handler::FsCacheReqHandler; pub use fuse::{FsOptions, OpenOptions, RemovemappingOne, SetattrValid, SetxattrFlags, ROOT_ID}; /// Information about a path in the filesystem. pub struct Entry { /// An `Inode` that uniquely identifies this path. During `lookup`, setting this to `0` means a /// negative entry. Returning `ENOENT` also means a negative entry but setting this to `0` /// allows the kernel to cache the negative result for `entry_timeout`. The value should be /// produced by converting a `FileSystem::Inode` into a `u64`. pub inode: u64, /// The generation number for this `Entry`. Typically used for network file systems. An `inode` /// / `generation` pair must be unique over the lifetime of the file system (rather than just /// the lifetime of the mount). In other words, if a `FileSystem` implementation re-uses an /// `Inode` after it has been deleted then it must assign a new, previously unused generation /// number to the `Inode` at the same time. pub generation: u64, /// Inode attributes. Even if `attr_timeout` is zero, `attr` must be correct. For example, for /// `open()`, FUSE uses `attr.st_size` from `lookup()` to determine how many bytes to request. /// If this value is not correct, incorrect data will be returned. pub attr: libc::stat64, /// Flags for `fuse::Attr.flags`. pub attr_flags: u32, /// How long the values in `attr` should be considered valid. If the attributes of the `Entry` /// are only modified by the FUSE client, then this should be set to a very large value. pub attr_timeout: Duration, /// How long the name associated with this `Entry` should be considered valid. 
If directory /// entries are only changed or deleted by the FUSE client, then this should be set to a very /// large value. pub entry_timeout: Duration, } impl From for fuse::EntryOut { fn from(entry: Entry) -> fuse::EntryOut { fuse::EntryOut { nodeid: entry.inode, generation: entry.generation, entry_valid: entry.entry_timeout.as_secs(), attr_valid: entry.attr_timeout.as_secs(), entry_valid_nsec: entry.entry_timeout.subsec_nanos(), attr_valid_nsec: entry.attr_timeout.subsec_nanos(), attr: fuse::Attr::with_flags(entry.attr, entry.attr_flags), } } } /// Represents information about an entry in a directory. pub struct DirEntry<'a> { /// The inode number for this entry. This does NOT have to be the same as the `Inode` for this /// directory entry. However, it must be the same as the `attr.st_ino` field of the `Entry` that /// would be returned by a `lookup` request in the parent directory for `name`. pub ino: libc::ino64_t, /// Any non-zero value that the kernel can use to identify the current point in the directory /// entry stream. It does not need to be the actual physical position. A value of `0` is /// reserved to mean "from the beginning" and should never be used. The `offset` value of the /// first entry in a stream should point to the beginning of the second entry and so on. pub offset: u64, /// The type of this directory entry. Valid values are any of the `libc::DT_*` constants. pub type_: u32, /// The name of this directory entry. There are no requirements for the contents of this field /// and any sequence of bytes is considered valid. pub name: &'a CStr, } /// A reply to a `getxattr` method call. pub enum GetxattrReply { /// The value of the requested extended attribute. This can be arbitrary textual or binary data /// and does not need to be nul-terminated. Value(Vec), /// The size of the buffer needed to hold the value of the requested extended attribute. Should /// be returned when the `size` parameter is 0. 
Callers should note that it is still possible /// for the size of the value to change in between `getxattr` calls and should not assume that a /// subsequent call to `getxattr` with the returned count will always succeed. Count(u32), } /// A reply to a `listxattr` method call. pub enum ListxattrReply { /// A buffer containing a nul-separated list of the names of all the extended attributes /// associated with this `Inode`. This list of names may be unordered and includes a namespace /// prefix. There may be several disjoint namespaces associated with a single `Inode`. Names(Vec), /// This size of the buffer needed to hold the full list of extended attribute names associated /// with this `Inode`. Should be returned when the `size` parameter is 0. Callers should note /// that it is still possible for the set of extended attributes to change between `listxattr` /// calls and so should not assume that a subsequent call to `listxattr` with the returned count /// will always succeed. Count(u32), } /// A trait for directly copying data from the fuse transport into a `File` without first storing it /// in an intermediate buffer. pub trait ZeroCopyReader { /// Copies at most `count` bytes from `self` directly into `f` at offset `off` without storing /// it in any intermediate buffers. If the return value is `Ok(n)` then it must be guaranteed /// that `0 <= n <= count`. If `n` is `0`, then it can indicate one of 3 possibilities: /// /// 1. There is no more data left in `self`. /// 2. There is no more space in `f`. /// 3. `count` was `0`. /// /// # Errors /// /// If any error is returned then the implementation must guarantee that no bytes were copied /// from `self`. If the underlying write to `f` returns `0` then the implementation must return /// an error of the kind `io::ErrorKind::WriteZero`. fn read_to( &mut self, f: &File, count: usize, off: u64, flags: Option, ) -> io::Result; /// Copies exactly `count` bytes of data from `self` into `f` at offset `off`. 
`off + count` /// must be less than `u64::MAX`. /// /// # Errors /// /// If an error is returned then the number of bytes copied from `self` is unspecified but it /// will never be more than `count`. fn read_exact_to( &mut self, f: &mut File, mut count: usize, mut off: u64, flags: Option, ) -> io::Result<()> { let c = count .try_into() .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; if off.checked_add(c).is_none() { return Err(io::Error::new( io::ErrorKind::InvalidInput, "`off` + `count` must be less than u64::MAX", )); } while count > 0 { match self.read_to(f, count, off, flags) { Ok(0) => { return Err(io::Error::new( io::ErrorKind::WriteZero, "failed to fill whole buffer", )) } Ok(n) => { count -= n; off += n as u64; } Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } } Ok(()) } /// Copies all remaining bytes from `self` into `f` at offset `off`. Equivalent to repeatedly /// calling `read_to` until it returns either `Ok(0)` or a non-`ErrorKind::Interrupted` error. /// /// # Errors /// /// If an error is returned then the number of bytes copied from `self` is unspecified. 
fn copy_to_end( &mut self, f: &mut File, mut off: u64, flags: Option, ) -> io::Result { let mut out = 0; loop { match self.read_to(f, usize::MAX, off, flags) { Ok(0) => return Ok(out), Ok(n) => { off = off.saturating_add(n as u64); out += n; } Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } } } } impl<'a, R: ZeroCopyReader> ZeroCopyReader for &'a mut R { fn read_to( &mut self, f: &File, count: usize, off: u64, flags: Option, ) -> io::Result { (**self).read_to(f, count, off, flags) } fn read_exact_to( &mut self, f: &mut File, count: usize, off: u64, flags: Option, ) -> io::Result<()> { (**self).read_exact_to(f, count, off, flags) } fn copy_to_end( &mut self, f: &mut File, off: u64, flags: Option, ) -> io::Result { (**self).copy_to_end(f, off, flags) } } /// A trait for directly copying data from a `File` into the fuse transport without first storing /// it in an intermediate buffer. pub trait ZeroCopyWriter { /// Copies at most `count` bytes from `f` at offset `off` directly into `self` without storing /// it in any intermediate buffers. If the return value is `Ok(n)` then it must be guaranteed /// that `0 <= n <= count`. If `n` is `0`, then it can indicate one of 3 possibilities: /// /// 1. There is no more data left in `f`. /// 2. There is no more space in `self`. /// 3. `count` was `0`. /// /// # Errors /// /// If any error is returned then the implementation must guarantee that no bytes were copied /// from `f`. If the underlying read from `f` returns `0` then the implementation must return an /// error of the kind `io::ErrorKind::UnexpectedEof`. fn write_from(&mut self, f: &File, count: usize, off: u64) -> io::Result; /// Copies exactly `count` bytes of data from `f` at offset `off` into `self`. `off + count` /// must be less than `u64::MAX`. /// /// # Errors /// /// If an error is returned then the number of bytes copied from `self` is unspecified but it /// well never be more than `count`. 
fn write_all_from(&mut self, f: &mut File, mut count: usize, mut off: u64) -> io::Result<()> { let c = count .try_into() .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; if off.checked_add(c).is_none() { return Err(io::Error::new( io::ErrorKind::InvalidInput, "`off` + `count` must be less than u64::MAX", )); } while count > 0 { match self.write_from(f, count, off) { Ok(0) => { return Err(io::Error::new( io::ErrorKind::UnexpectedEof, "failed to write whole buffer", )) } Ok(n) => { // No need for checked math here because we verified that `off + count` will not // overflow and `n` must be <= `count`. count -= n; off += n as u64; } Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } } Ok(()) } /// Copies all remaining bytes from `f` at offset `off` into `self`. Equivalent to repeatedly /// calling `write_from` until it returns either `Ok(0)` or a non-`ErrorKind::Interrupted` /// error. /// /// # Errors /// /// If an error is returned then the number of bytes copied from `f` is unspecified. fn copy_to_end(&mut self, f: &mut File, mut off: u64) -> io::Result { let mut out = 0; loop { match self.write_from(f, usize::MAX, off) { Ok(0) => return Ok(out), Ok(n) => { off = off.saturating_add(n as u64); out += n; } Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } } } } impl<'a, W: ZeroCopyWriter> ZeroCopyWriter for &'a mut W { fn write_from(&mut self, f: &File, count: usize, off: u64) -> io::Result { (**self).write_from(f, count, off) } fn write_all_from(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<()> { (**self).write_all_from(f, count, off) } fn copy_to_end(&mut self, f: &mut File, off: u64) -> io::Result { (**self).copy_to_end(f, off) } } /// Additional context associated with requests. #[derive(Clone, Copy, Debug)] pub struct Context { /// The user ID of the calling process. pub uid: libc::uid_t, /// The group ID of the calling process. 
pub gid: libc::gid_t, /// The thread group ID of the calling process. pub pid: libc::pid_t, } impl From for Context { fn from(source: fuse::InHeader) -> Self { Context { uid: source.uid, gid: source.gid, pid: source.pid as i32, } } } /// Request extensions #[derive(Clone, Default, Debug)] pub struct Extensions { pub secctx: Option, pub sup_gid: Option, } /// Additional security context associated with requests. #[derive(Clone, Debug, Default)] pub struct SecContext { /// Name of security context pub name: CString, /// Actual security context pub secctx: Vec, } /// A trait for iterating over the contents of a directory. This trait is needed because rust /// doesn't support generic associated types, which means that it's not possible to implement a /// regular iterator that yields a `DirEntry` due to its generic lifetime parameter. pub trait DirectoryIterator { /// Returns the next entry in the directory or `None` if there are no more. fn next(&mut self) -> Option; } /// The main trait that connects a file system with a transport. #[allow(unused_variables)] pub trait FileSystem { /// Represents a location in the filesystem tree and can be used to perform operations that act /// on the metadata of a file/directory (e.g., `getattr` and `setattr`). Can also be used as the /// starting point for looking up paths in the filesystem tree. An `Inode` may support operating /// directly on the content of the path that to which it points. `FileSystem` implementations /// that support this should set the `FsOptions::ZERO_MESSAGE_OPEN` option in the return value /// of the `init` function. On linux based systems, an `Inode` is equivalent to opening a file /// or directory with the `libc::O_PATH` flag. /// /// # Lookup Count /// /// The `FileSystem` implementation is required to keep a "lookup count" for every `Inode`. /// Every time an `Entry` is returned by a `FileSystem` trait method, this lookup count should /// increase by 1. 
The lookup count for an `Inode` decreases when the kernel sends a `forget` /// request. `Inode`s with a non-zero lookup count may receive requests from the kernel even /// after calls to `unlink`, `rmdir` or (when overwriting an existing file) `rename`. /// `FileSystem` implementations must handle such requests properly and it is recommended to /// defer removal of the `Inode` until the lookup count reaches zero. Calls to `unlink`, `rmdir` /// or `rename` will be followed closely by `forget` unless the file or directory is open, in /// which case the kernel issues `forget` only after the `release` or `releasedir` calls. /// /// Note that if a file system will be exported over NFS the `Inode`'s lifetime must extend even /// beyond `forget`. See the `generation` field in `Entry`. type Inode: From + Into; /// Represents a file or directory that is open for reading/writing. type Handle: From + Into; /// An iterator over the entries of a directory. See the documentation for `readdir` for more /// details. type DirIter: DirectoryIterator; /// Initialize the file system. /// /// This method is called when a connection to the FUSE kernel module is first established. The /// `capable` parameter indicates the features that are supported by the kernel module. The /// implementation should return the options that it supports. Any options set in the returned /// `FsOptions` that are not also set in `capable` are silently dropped. fn init(&self, capable: FsOptions) -> io::Result { Ok(FsOptions::empty()) } /// Clean up the file system. /// /// Called when the filesystem exits. All open `Handle`s should be closed and the lookup count /// for all open `Inode`s implicitly goes to zero. At this point the connection to the FUSE /// kernel module may already be gone so implementations should not rely on being able to /// communicate with the kernel. fn destroy(&self) {} /// Look up a directory entry by name and get its attributes. 
/// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. fn lookup(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Forget about an inode. /// /// Called when the kernel removes an inode from its internal caches. `count` indicates the /// amount by which the lookup count for the inode should be decreased. If reducing the lookup /// count by `count` causes it to go to zero, then the implementation may delete the `Inode`. fn forget(&self, ctx: Context, inode: Self::Inode, count: u64) {} /// Forget about multiple inodes. /// /// `requests` is a vector of `(inode, count)` pairs. See the documentation for `forget` for /// more information. fn batch_forget(&self, ctx: Context, requests: Vec<(Self::Inode, u64)>) { for (inode, count) in requests { self.forget(ctx, inode, count) } } /// Get attributes for a file / directory. /// /// If `handle` is not `None`, then it contains the handle previously returned by the /// implementation after a call to `open` or `opendir`. However, implementations should still /// take care to verify the handle if they do not trust the client (e.g., virtio-fs). /// /// If writeback caching is enabled (`FsOptions::WRITEBACK_CACHE`), then the kernel module /// likely has a better idea of the length of the file than the file system (for /// example, if there was a write that extended the size of the file but has not yet been /// flushed). In this case, the `st_size` field of the returned struct is ignored. /// /// The returned `Duration` indicates how long the returned attributes should be considered /// valid by the client. If the attributes are only changed via the FUSE kernel module (i.e., /// the kernel module has exclusive access), then this should be a very large value. 
fn getattr( &self, ctx: Context, inode: Self::Inode, handle: Option, ) -> io::Result<(libc::stat64, Duration)> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Set attributes for a file / directory. /// /// If `handle` is not `None`, then it contains the handle previously returned by the /// implementation after a call to `open` or `opendir`. However, implementations should still /// take care to verify the handle if they do not trust the client (e.g., virtio-fs). /// /// The `valid` parameter indicates the fields of `attr` that may be considered valid and should /// be set by the file system. The content of all other fields in `attr` is undefined. /// /// If the `FsOptions::HANDLE_KILLPRIV_V2` was set during `init`, then the implementation is /// expected to reset the setuid and setgid bits if the file size or owner is being changed. /// /// This method returns the new attributes after making the modifications requested by the /// client. The returned `Duration` indicates how long the returned attributes should be /// considered valid by the client. If the attributes are only changed via the FUSE kernel /// module (i.e., the kernel module has exclusive access), then this should be a very large /// value. fn setattr( &self, ctx: Context, inode: Self::Inode, attr: libc::stat64, handle: Option, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Read a symbolic link. fn readlink(&self, ctx: Context, inode: Self::Inode) -> io::Result> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Create a symbolic link. /// /// The file system must create a symbolic link named `name` in the directory represented by /// `parent`, which contains the string `linkname`. Returns an `Entry` for the newly created /// symlink. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. 
fn symlink( &self, ctx: Context, linkname: &CStr, parent: Self::Inode, name: &CStr, extensions: Extensions, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Create a file node. /// /// Create a regular file, character device, block device, fifo, or socket node named `name` in /// the directory represented by `inode`. Valid values for `mode` and `rdev` are the same as /// those accepted by the `mknod(2)` system call. Returns an `Entry` for the newly created node. /// /// When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for setting /// the permissions of the created node to `mode & !umask`. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. #[allow(clippy::too_many_arguments)] fn mknod( &self, ctx: Context, inode: Self::Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, extensions: Extensions, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Create a directory. /// /// When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for setting /// the permissions of the created directory to `mode & !umask`. Returns an `Entry` for the /// newly created directory. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. fn mkdir( &self, ctx: Context, parent: Self::Inode, name: &CStr, mode: u32, umask: u32, extensions: Extensions, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Remove a file. /// /// If the file's inode lookup count is non-zero, then the file system is expected to delay /// removal of the inode until the lookup count goes to zero. See the documentation of the /// `forget` function for more information. fn unlink(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Remove a directory. 
/// /// If the directory's inode lookup count is non-zero, then the file system is expected to delay /// removal of the inode until the lookup count goes to zero. See the documentation of the /// `forget` function for more information. fn rmdir(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Rename a file / directory. /// /// If the destination exists, it should be atomically replaced. If the destination's inode /// lookup count is non-zero, then the file system is expected to delay removal of the inode /// until the lookup count goes to zero. See the documentation of the `forget` function for more /// information. /// /// `flags` may be `libc::RENAME_EXCHANGE` or `libc::RENAME_NOREPLACE`. If /// `libc::RENAME_NOREPLACE` is specified, the implementation must not overwrite `newname` if it /// exists and must return an error instead. If `libc::RENAME_EXCHANGE` is specified, the /// implementation must atomically exchange the two files, i.e., both must exist and neither may /// be deleted. fn rename( &self, ctx: Context, olddir: Self::Inode, oldname: &CStr, newdir: Self::Inode, newname: &CStr, flags: u32, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Create a hard link. /// /// Create a hard link from `inode` to `newname` in the directory represented by `newparent`. /// /// If this call is successful then the lookup count of the `Inode` associated with the returned /// `Entry` must be increased by 1. fn link( &self, ctx: Context, inode: Self::Inode, newparent: Self::Inode, newname: &CStr, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Open a file. /// /// Open the file associated with `inode` for reading / writing. All values accepted by the /// `open(2)` system call are valid values for `flags` and must be handled by the file system. 
/// However, there are some additional rules: /// /// * Creation flags (`libc::O_CREAT`, `libc::O_EXCL`, `libc::O_NOCTTY`) will be filtered out /// and handled by the kernel. /// /// * The file system should check the access modes (`libc::O_RDONLY`, `libc::O_WRONLY`, /// `libc::O_RDWR`) to determine if the operation is permitted. If the file system was mounted /// with the `-o default_permissions` mount option, then this check will also be carried out /// by the kernel before sending the open request. /// /// * When writeback caching is enabled (`FsOptions::WRITEBACK_CACHE`) the kernel may send read /// requests even for files opened with `libc::O_WRONLY`. The file system should be prepared /// to handle this. /// /// * When writeback caching is enabled, the kernel will handle the `libc::O_APPEND` flag. /// However, this will not work reliably unless the kernel has exclusive access to the file. /// In this case the file system may either ignore the `libc::O_APPEND` flag or return an /// error to indicate that reliable `libc::O_APPEND` handling is not available. /// /// * When writeback caching is disabled, the file system is expected to properly handle /// `libc::O_APPEND` and ensure that each write is appended to the end of the file. /// /// The file system may choose to return a `Handle` to refer to the newly opened file. The /// kernel will then use this `Handle` for all operations on the content of the file (`read`, /// `write`, `flush`, `release`, `fsync`). If the file system does not return a /// `Handle` then the kernel will use the `Inode` for the file to operate on its contents. In /// this case the file system may wish to enable the `FsOptions::ZERO_MESSAGE_OPEN` feature if /// it is supported by the kernel (see below). /// /// The returned `OpenOptions` allow the file system to change the way the opened file is /// handled by the kernel. See the documentation of `OpenOptions` for more information. 
/// /// If `kill_priv` is true then it indicates that the file system is expected to clear the /// setuid and setgid bits. /// /// If the `FsOptions::ZERO_MESSAGE_OPEN` feature is enabled by both the file system /// implementation and the kernel, then the file system may return an error of `ENOSYS`. This /// will be interpreted by the kernel as success and future calls to `open` and `release` will /// be handled by the kernel without being passed on to the file system. fn open( &self, ctx: Context, inode: Self::Inode, kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { // Matches the behavior of libfuse. Ok((None, OpenOptions::empty())) } /// Create and open a file. /// /// If the file does not already exist, the file system should create it with the specified /// `mode`. When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for /// setting the permissions of the created file to `mode & !umask`. /// /// If `kill_priv` is true then it indicates that the file system is expected to clear the /// setuid and setgid bits. /// /// If the file system returns an `ENOSYS` error, then the kernel will treat this method as /// unimplemented and all future calls to `create` will be handled by calling the `mknod` and /// `open` methods instead. /// /// See the documentation for the `open` method for more information about opening the file. In /// addition to the optional `Handle` and the `OpenOptions`, the file system must also return an /// `Entry` for the file. This increases the lookup count for the `Inode` associated with the /// file by 1. #[allow(clippy::too_many_arguments)] fn create( &self, ctx: Context, parent: Self::Inode, name: &CStr, mode: u32, kill_priv: bool, flags: u32, umask: u32, extensions: Extensions, ) -> io::Result<(Entry, Option, OpenOptions)> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Read data from a file. 
/// /// Returns `size` bytes of data starting from offset `off` from the file associated with /// `inode` or `handle`. /// /// `flags` contains the flags used to open the file. Similarly, `handle` is the `Handle` /// returned by the file system from the `open` method, if any. If the file system /// implementation did not return a `Handle` from `open` then the contents of `handle` are /// undefined. /// /// This method should return exactly the number of bytes requested by the kernel, except in the /// case of error or EOF. Otherwise, the kernel will substitute the rest of the data with /// zeroes. An exception to this rule is if the file was opened with the "direct I/O" option /// (`libc::O_DIRECT`), in which case the kernel will forward the return code from this method /// to the userspace application that made the system call. #[allow(clippy::too_many_arguments)] fn read( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, w: W, size: u32, offset: u64, lock_owner: Option, flags: u32, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Write data to a file. /// /// Writes `size` bytes of data starting from offset `off` to the file associated with `inode` /// or `handle`. /// /// `flags` contains the flags used to open the file. Similarly, `handle` is the `Handle` /// returned by the file system from the `open` method, if any. If the file system /// implementation did not return a `Handle` from `open` then the contents of `handle` are /// undefined. /// /// If `delayed_write` is true then it indicates that this is a write for buffered data. /// /// If `kill_priv` is true then it indicates that the file system is expected to clear the /// setuid and setgid bits. /// /// This method should return exactly the number of bytes requested by the kernel, except in the /// case of error. 
An exception to this rule is if the file was opened with the "direct I/O" /// option (`libc::O_DIRECT`), in which case the kernel will forward the return code from this /// method to the userspace application that made the system call. #[allow(clippy::too_many_arguments)] fn write( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, r: R, size: u32, offset: u64, lock_owner: Option, delayed_write: bool, kill_priv: bool, flags: u32, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Flush the contents of a file. /// /// This method is called on every `close()` of a file descriptor. Since it is possible to /// duplicate file descriptors there may be many `flush` calls for one call to `open`. /// /// File systems should not make any assumptions about when `flush` will be /// called or even if it will be called at all. /// /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the /// file system did not return a `Handle` from `open` then the contents of `handle` are /// undefined. /// /// Unlike `fsync`, the file system is not required to flush pending writes. One reason to flush /// data is if the file system wants to return write errors during close. However, this is not /// portable because POSIX does not require `close` to wait for delayed I/O to complete. /// /// If the `FsOptions::POSIX_LOCKS` feature is enabled, then the file system must remove all /// locks belonging to `lock_owner`. /// /// If this method returns an `ENOSYS` error then the kernel will treat it as success and all /// subsequent calls to `flush` will be handled by the kernel without being forwarded to the /// file system. fn flush( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, lock_owner: u64, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Synchronize file contents. 
/// /// File systems must ensure that the file contents have been flushed to disk before returning /// from this method. If `datasync` is true then only the file data (but not the metadata) needs /// to be flushed. /// /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the /// file system did not return a `Handle` from `open` then the contents of /// `handle` are undefined. /// /// If this method returns an `ENOSYS` error then the kernel will treat it as success and all /// subsequent calls to `fsync` will be handled by the kernel without being forwarded to the /// file system. fn fsync( &self, ctx: Context, inode: Self::Inode, datasync: bool, handle: Self::Handle, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Allocate requested space for file data. /// /// If this function returns success, then the file system must guarantee that it is possible to /// write up to `length` bytes of data starting at `offset` without failing due to a lack of /// free space on the disk. /// /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the /// file system did not return a `Handle` from `open` then the contents of `handle` are /// undefined. /// /// If this method returns an `ENOSYS` error then the kernel will treat that as a permanent /// failure: all future calls to `fallocate` will fail with `EOPNOTSUPP` without being forwarded /// to the file system. fn fallocate( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Release an open file. /// /// This method is called when there are no more references to an open file: all file /// descriptors are closed and all memory mappings are unmapped. /// /// For every `open` call there will be exactly one `release` call (unless the file system is /// force-unmounted). 
/// /// The file system may reply with an error, but error values are not returned to the `close()` /// or `munmap()` which triggered the release. /// /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the /// file system did not return a `Handle` from `open` then the contents of /// `handle` are undefined. /// /// If `flush` is `true` then the contents of the file should also be flushed to disk. #[allow(clippy::too_many_arguments)] fn release( &self, ctx: Context, inode: Self::Inode, flags: u32, handle: Self::Handle, flush: bool, flock_release: bool, lock_owner: Option, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Get information about the file system. fn statfs(&self, ctx: Context, inode: Self::Inode) -> io::Result { // Safe because we are zero-initializing a struct with only POD fields. let mut st: libc::statvfs64 = unsafe { mem::zeroed() }; // This matches the behavior of libfuse as it returns these values if the // filesystem doesn't implement this method. st.f_namemax = 255; st.f_bsize = 512; Ok(st) } /// Set an extended attribute. /// /// If this method fails with an `ENOSYS` error, then the kernel will treat that as a permanent /// failure. The kernel will return `EOPNOTSUPP` for all future calls to `setxattr` without /// forwarding them to the file system. /// /// Valid values for flags are the same as those accepted by the `setxattr(2)` system call and /// have the same behavior. fn setxattr( &self, ctx: Context, inode: Self::Inode, name: &CStr, value: &[u8], flags: u32, extra_flags: SetxattrFlags, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Get an extended attribute. /// /// If `size` is 0, then the file system should respond with `GetxattrReply::Count` and the /// number of bytes needed to hold the value. 
If `size` is large enough to hold the value, then /// the file system should reply with `GetxattrReply::Value` and the value of the extended /// attribute. If `size` is not 0 but is also not large enough to hold the value, then the file /// system should reply with an `ERANGE` error. /// /// If this method fails with an `ENOSYS` error, then the kernel will treat that as a permanent /// failure. The kernel will return `EOPNOTSUPP` for all future calls to `getxattr` without /// forwarding them to the file system. fn getxattr( &self, ctx: Context, inode: Self::Inode, name: &CStr, size: u32, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// List extended attribute names. /// /// If `size` is 0, then the file system should respond with `ListxattrReply::Count` and the /// number of bytes needed to hold a `\0` byte separated list of the names of all the extended /// attributes. If `size` is large enough to hold the `\0` byte separated list of the attribute /// names, then the file system should reply with `ListxattrReply::Names` and the list. If /// `size` is not 0 but is also not large enough to hold the list, then the file system should /// reply with an `ERANGE` error. /// /// If this method fails with an `ENOSYS` error, then the kernel will treat that as a permanent /// failure. The kernel will return `EOPNOTSUPP` for all future calls to `listxattr` without /// forwarding them to the file system. fn listxattr(&self, ctx: Context, inode: Self::Inode, size: u32) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Remove an extended attribute. /// /// If this method fails with an `ENOSYS` error, then the kernel will treat that as a permanent /// failure. The kernel will return `EOPNOTSUPP` for all future calls to `removexattr` without /// forwarding them to the file system. 
fn removexattr(&self, ctx: Context, inode: Self::Inode, name: &CStr) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Open a directory for reading. /// /// The file system may choose to return a `Handle` to refer to the newly opened directory. The /// kernel will then use this `Handle` for all operations on the content of the directory /// (`readdir`, `readdirplus`, `fsyncdir`, `releasedir`). If the file system does not return a /// `Handle` then the kernel will use the `Inode` for the directory to operate on its contents. /// In this case the file system may wish to enable the `FsOptions::ZERO_MESSAGE_OPENDIR` /// feature if it is supported by the kernel (see below). /// /// The returned `OpenOptions` allow the file system to change the way the opened directory is /// handled by the kernel. See the documentation of `OpenOptions` for more information. /// /// If the `FsOptions::ZERO_MESSAGE_OPENDIR` feature is enabled by both the file system /// implementation and the kernel, then the file system may return an error of `ENOSYS`. This /// will be interpreted by the kernel as success and future calls to `opendir` and `releasedir` /// will be handled by the kernel without being passed on to the file system. fn opendir( &self, ctx: Context, inode: Self::Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { // Matches the behavior of libfuse. Ok((None, OpenOptions::empty())) } /// Read a directory. /// /// `handle` is the `Handle` returned by the file system from the `opendir` method, if any. If /// the file system did not return a `Handle` from `opendir` then the contents of `handle` are /// undefined. /// /// `size` indicates the maximum number of bytes that should be returned by this method. /// /// If `offset` is non-zero then it corresponds to one of the `offset` values from a `DirEntry` /// that was previously returned by a call to `readdir` for the same handle. 
In this case the /// file system should skip over the entries before the position defined by the `offset` value. /// If entries were added or removed while the `Handle` is open then the file system may still /// include removed entries or skip newly created entries. However, adding or removing entries /// should never cause the file system to skip over unrelated entries or include an entry more /// than once. This means that `offset` cannot be a simple index and must include sufficient /// information to uniquely determine the next entry in the list even when the set of entries is /// being changed. /// /// The file system may return entries for the current directory (".") and parent directory /// ("..") but is not required to do so. If the file system does not return these entries, then /// they are implicitly added by the kernel. /// /// The lookup count for `Inode`s associated with the returned directory entries is **NOT** /// affected by this method. /// fn readdir( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, size: u32, offset: u64, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Synchronize the contents of a directory. /// /// File systems must ensure that the directory contents have been flushed to disk before /// returning from this method. If `datasync` is true then only the directory data (but not the /// metadata) needs to be flushed. /// /// `handle` is the `Handle` returned by the file system from the `opendir` method, if any. If /// the file system did not return a `Handle` from `opendir` then the contents of /// `handle` are undefined. /// /// If this method returns an `ENOSYS` error then the kernel will treat it as success and all /// subsequent calls to `fsyncdir` will be handled by the kernel without being forwarded to the /// file system. 
fn fsyncdir( &self, ctx: Context, inode: Self::Inode, datasync: bool, handle: Self::Handle, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Release an open directory. /// /// For every `opendir` call there will be exactly one `releasedir` call (unless the file system /// is force-unmounted). /// /// `handle` is the `Handle` returned by the file system from the `opendir` method, if any. If /// the file system did not return a `Handle` from `opendir` then the contents of `handle` are /// undefined. /// /// `flags` contains used the flags used to open the directory in `opendir`. fn releasedir( &self, ctx: Context, inode: Self::Inode, flags: u32, handle: Self::Handle, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Setup a mapping so that guest can access files in DAX style. #[allow(clippy::too_many_arguments)] fn setupmapping( &self, _ctx: Context, inode: Self::Inode, handle: Self::Handle, foffset: u64, len: u64, flags: u64, moffset: u64, vu_req: &mut T, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } fn removemapping( &self, _ctx: Context, requests: Vec, vu_req: &mut T, ) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Check file access permissions. /// /// This method is called when a userspace process in the client makes an `access()` or /// `chdir()` system call. If the file system was mounted with the `-o default_permissions` /// mount option, then the kernel will perform these checks itself and this method will not be /// called. /// /// If this method returns an `ENOSYS` error, then the kernel will treat it as a permanent /// success: all future calls to `access` will return success without being forwarded to the /// file system. fn access(&self, ctx: Context, inode: Self::Inode, mask: u32) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Reposition read/write file offset. 
fn lseek( &self, ctx: Context, inode: Self::Inode, handle: Self::Handle, offset: u64, whence: u32, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } #[allow(clippy::too_many_arguments)] fn copyfilerange( &self, ctx: Context, inode_in: Self::Inode, handle_in: Self::Handle, offset_in: u64, inode_out: Self::Inode, handle_out: Self::Handle, offset_out: u64, len: u64, flags: u64, ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// Synchronize the filesystem containing the file referenced by `inode`. When running with /// --announce-submounts, `syncfs` is called once per submount that is to be synced. When /// running without--announce-submounts, `syncfs` is called on the root mount, but all submounts /// need to be synced, too. fn syncfs(&self, _ctx: Context, inode: Self::Inode) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn getlk(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn setlk(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn setlkw(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn ioctl(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn bmap(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn poll(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn notify_reply(&self) -> io::Result<()> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } /// TODO: support this fn tmpfile(&self) -> io::Result<(Entry, Option, OpenOptions)> { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } } virtiofsd-1.10.0/src/fs_cache_req_handler.rs000064400000000000000000000036361046102023000171600ustar 00000000000000use crate::fuse; use std::io; use std::os::unix::io::RawFd; use 
vhost::vhost_user::message::{
    VhostUserFSBackendMsg, VhostUserFSBackendMsgFlags, VHOST_USER_FS_BACKEND_ENTRIES,
};
use vhost::vhost_user::{Backend, VhostUserFrontendReqHandler};

/// Trait for virtio-fs cache requests operations. This is mainly used to hide
/// vhost-user details from virtio-fs's fuse part.
pub trait FsCacheReqHandler: Send + Sync + 'static {
    /// Setup a dedicated mapping so that guest can access file data in DAX style.
    fn map(
        &mut self,
        foffset: u64,
        moffset: u64,
        len: u64,
        flags: u64,
        fd: RawFd,
    ) -> io::Result<()>;

    /// Remove those mappings that provide the access to file data.
    // NOTE(review): the element type of this Vec was stripped by extraction; it
    // is restored as `fuse::RemovemappingOne`, matching the fields read below
    // (`req.len`, `req.moffset`) and the `removemapping` FUSE request it serves.
    fn unmap(&mut self, requests: Vec<fuse::RemovemappingOne>) -> io::Result<()>;
}

impl FsCacheReqHandler for Backend {
    fn map(
        &mut self,
        foffset: u64,
        moffset: u64,
        len: u64,
        flags: u64,
        fd: RawFd,
    ) -> io::Result<()> {
        // A backend message carries arrays of entries; a single mapping only
        // uses slot 0.
        let mut msg: VhostUserFSBackendMsg = Default::default();
        msg.fd_offset[0] = foffset;
        msg.cache_offset[0] = moffset;
        msg.len[0] = len;
        // Writable mappings must also be readable; everything else is mapped
        // read-only.
        msg.flags[0] = if (flags & fuse::SetupmappingFlags::WRITE.bits()) != 0 {
            VhostUserFSBackendMsgFlags::MAP_W | VhostUserFSBackendMsgFlags::MAP_R
        } else {
            VhostUserFSBackendMsgFlags::MAP_R
        };

        self.fs_backend_map(&msg, &fd)?;
        Ok(())
    }

    fn unmap(&mut self, requests: Vec<fuse::RemovemappingOne>) -> io::Result<()> {
        // One message holds at most VHOST_USER_FS_BACKEND_ENTRIES entries, so
        // batch the requests into chunks of that size.
        for chunk in requests.chunks(VHOST_USER_FS_BACKEND_ENTRIES) {
            let mut msg: VhostUserFSBackendMsg = Default::default();

            for (ind, req) in chunk.iter().enumerate() {
                msg.len[ind] = req.len;
                msg.cache_offset[ind] = req.moffset;
            }

            self.fs_backend_unmap(&msg)?;
        }
        Ok(())
    }
}
virtiofsd-1.10.0/src/fuse.rs000064400000000000000000001155101046102023000140160ustar 00000000000000// Copyright 2019 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::convert::TryFrom;
use std::mem;

use crate::macros::enum_value;
use bitflags::bitflags;
use vm_memory::ByteValued;

/// Version number of this interface.
pub const KERNEL_VERSION: u32 = 7;

/// Minor version number of this interface.
pub const KERNEL_MINOR_VERSION: u32 = 38; /// Minimum Minor version number supported. If client sends a minor /// number lesser than this, we don't support it. pub const MIN_KERNEL_MINOR_VERSION: u32 = 27; /// The ID of the inode corresponding to the root directory of the file system. pub const ROOT_ID: u64 = 1; // Bitmasks for `fuse_setattr_in.valid`. const FATTR_MODE: u32 = 1 << 0; const FATTR_UID: u32 = 1 << 1; const FATTR_GID: u32 = 1 << 2; const FATTR_SIZE: u32 = 1 << 3; const FATTR_ATIME: u32 = 1 << 4; const FATTR_MTIME: u32 = 1 << 5; pub const FATTR_FH: u32 = 1 << 6; const FATTR_ATIME_NOW: u32 = 1 << 7; const FATTR_MTIME_NOW: u32 = 1 << 8; pub const FATTR_LOCKOWNER: u32 = 1 << 9; const FATTR_CTIME: u32 = 1 << 10; const FATTR_KILL_SUIDGID: u32 = 1 << 11; bitflags! { pub struct SetattrValid: u32 { const MODE = FATTR_MODE; const UID = FATTR_UID; const GID = FATTR_GID; const SIZE = FATTR_SIZE; const ATIME = FATTR_ATIME; const MTIME = FATTR_MTIME; const ATIME_NOW = FATTR_ATIME_NOW; const MTIME_NOW = FATTR_MTIME_NOW; const CTIME = FATTR_CTIME; const KILL_SUIDGID = FATTR_KILL_SUIDGID; } } // Flags returned by the OPEN request. /// Bypass page cache for this open file. const FOPEN_DIRECT_IO: u32 = 1 << 0; /// Don't invalidate the data cache on open. const FOPEN_KEEP_CACHE: u32 = 1 << 1; /// The file is not seekable. const FOPEN_NONSEEKABLE: u32 = 1 << 2; /// Allow caching this directory. const FOPEN_CACHE_DIR: u32 = 1 << 3; /// The file is stream-like (no file position at all). #[allow(dead_code)] const FOPEN_STREAM: u32 = 1 << 4; /// Don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) const FOPEN_NOFLUSH: u32 = 1 << 5; /// Allow concurrent direct writes on the same inode const FOPEN_PARALLEL_DIRECT_WRITES: u32 = 1 << 6; bitflags! { /// Options controlling the behavior of files opened by the server in response /// to an open or create request. 
pub struct OpenOptions: u32 { const DIRECT_IO = FOPEN_DIRECT_IO; const KEEP_CACHE = FOPEN_KEEP_CACHE; const NONSEEKABLE = FOPEN_NONSEEKABLE; const CACHE_DIR = FOPEN_CACHE_DIR; const STREAM = FOPEN_CACHE_DIR; const NOFLUSH = FOPEN_NOFLUSH; const PARALLEL_DIRECT_WRITES = FOPEN_PARALLEL_DIRECT_WRITES; } } // INIT request/reply flags. /// Asynchronous read requests. const ASYNC_READ: u64 = 1 << 0; /// Remote locking for POSIX file locks. const POSIX_LOCKS: u64 = 1 << 1; /// Kernel sends file handle for fstat, etc... (not yet supported). const FILE_OPS: u64 = 1 << 2; /// Handles the O_TRUNC open flag in the filesystem. const ATOMIC_O_TRUNC: u64 = 1 << 3; /// FileSystem handles lookups of "." and "..". const EXPORT_SUPPORT: u64 = 1 << 4; /// FileSystem can handle write size larger than 4kB. const BIG_WRITES: u64 = 1 << 5; /// Don't apply umask to file mode on create operations. const DONT_MASK: u64 = 1 << 6; /// Kernel supports splice write on the device. const SPLICE_WRITE: u64 = 1 << 7; /// Kernel supports splice move on the device. const SPLICE_MOVE: u64 = 1 << 8; /// Kernel supports splice read on the device. const SPLICE_READ: u64 = 1 << 9; /// Remote locking for BSD style file locks. const FLOCK_LOCKS: u64 = 1 << 10; /// Kernel supports ioctl on directories. const HAS_IOCTL_DIR: u64 = 1 << 11; /// Automatically invalidate cached pages. const AUTO_INVAL_DATA: u64 = 1 << 12; /// Do READDIRPLUS (READDIR+LOOKUP in one). const DO_READDIRPLUS: u64 = 1 << 13; /// Adaptive readdirplus. const READDIRPLUS_AUTO: u64 = 1 << 14; /// Asynchronous direct I/O submission. const ASYNC_DIO: u64 = 1 << 15; /// Use writeback cache for buffered writes. const WRITEBACK_CACHE: u64 = 1 << 16; /// Kernel supports zero-message opens. const NO_OPEN_SUPPORT: u64 = 1 << 17; /// Allow parallel lookups and readdir. const PARALLEL_DIROPS: u64 = 1 << 18; /// Fs handles killing suid/sgid/cap on write/chown/trunc. const HANDLE_KILLPRIV: u64 = 1 << 19; /// FileSystem supports posix acls. 
const POSIX_ACL: u64 = 1 << 20; /// Reading the device after abort returns ECONNABORTED. const ABORT_ERROR: u64 = 1 << 21; /// Init_out.max_pages contains the max number of req pages. const MAX_PAGES: u64 = 1 << 22; /// Cache READLINK responses const CACHE_SYMLINKS: u64 = 1 << 23; /// Kernel supports zero-message opendir const NO_OPENDIR_SUPPORT: u64 = 1 << 24; /// Only invalidate cached pages on explicit request const EXPLICIT_INVAL_DATA: u64 = 1 << 25; /// init_out.map_alignment contains log2(byte alignment) for /// foffset and moffset fields in struct fuse_setupmapping_out and /// fuse_removemapping_one #[allow(dead_code)] const MAP_ALIGNMENT: u64 = 1 << 26; /// Kernel supports auto-mounting directory submounts const SUBMOUNTS: u64 = 1 << 27; /// Fs handles killing suid/sgid/cap on write/chown/trunc (v2). const HANDLE_KILLPRIV_V2: u64 = 1 << 28; /// Server supports extended struct SetxattrIn const SETXATTR_EXT: u64 = 1 << 29; /// Extended fuse_init_in request const INIT_EXT: u64 = 1 << 30; /// Reserved. Do not use. const INIT_RESERVED: u64 = 1 << 31; /// Add security context to create, mkdir, symlink, and mknod const SECURITY_CTX: u64 = 1 << 32; /// Use per inode DAX const HAS_INODE_DAX: u64 = 1 << 33; /// Add supplementary groups info to create, mkdir, symlink /// and mknod (single group that matches parent) const CREATE_SUPP_GROUP: u64 = 1 << 34; bitflags! { /// A bitfield passed in as a parameter to and returned from the `init` method of the /// `FileSystem` trait. pub struct FsOptions: u64 { /// Indicates that the filesystem supports asynchronous read requests. /// /// If this capability is not requested/available, the kernel will ensure that there is at /// most one pending read request per file-handle at any time, and will attempt to order /// read requests by increasing offset. /// /// This feature is enabled by default when supported by the kernel. const ASYNC_READ = ASYNC_READ; /// Indicates that the filesystem supports "remote" locking. 
/// /// This feature is not enabled by default and should only be set if the filesystem /// implements the `getlk` and `setlk` methods of the `FileSystem` trait. const POSIX_LOCKS = POSIX_LOCKS; /// Kernel sends file handle for fstat, etc... (not yet supported). const FILE_OPS = FILE_OPS; /// Indicates that the filesystem supports the `O_TRUNC` open flag. If disabled, and an /// application specifies `O_TRUNC`, fuse first calls `setattr` to truncate the file and /// then calls `open` with `O_TRUNC` filtered out. /// /// This feature is enabled by default when supported by the kernel. const ATOMIC_O_TRUNC = ATOMIC_O_TRUNC; /// Indicates that the filesystem supports lookups of "." and "..". /// /// This feature is disabled by default. const EXPORT_SUPPORT = EXPORT_SUPPORT; /// FileSystem can handle write size larger than 4kB. const BIG_WRITES = BIG_WRITES; /// Indicates that the kernel should not apply the umask to the file mode on create /// operations. /// /// This feature is disabled by default. const DONT_MASK = DONT_MASK; /// Indicates that the server should try to use `splice(2)` when writing to the fuse device. /// This may improve performance. /// /// This feature is not currently supported. const SPLICE_WRITE = SPLICE_WRITE; /// Indicates that the server should try to move pages instead of copying when writing to / /// reading from the fuse device. This may improve performance. /// /// This feature is not currently supported. const SPLICE_MOVE = SPLICE_MOVE; /// Indicates that the server should try to use `splice(2)` when reading from the fuse /// device. This may improve performance. /// /// This feature is not currently supported. const SPLICE_READ = SPLICE_READ; /// If set, then calls to `flock` will be emulated using POSIX locks and must /// then be handled by the filesystem's `setlock()` handler. 
/// /// If not set, `flock` calls will be handled by the FUSE kernel module internally (so any /// access that does not go through the kernel cannot be taken into account). /// /// This feature is disabled by default. const FLOCK_LOCKS = FLOCK_LOCKS; /// Indicates that the filesystem supports ioctl's on directories. /// /// This feature is enabled by default when supported by the kernel. const HAS_IOCTL_DIR = HAS_IOCTL_DIR; /// Traditionally, while a file is open the FUSE kernel module only asks the filesystem for /// an update of the file's attributes when a client attempts to read beyond EOF. This is /// unsuitable for e.g. network filesystems, where the file contents may change without the /// kernel knowing about it. /// /// If this flag is set, FUSE will check the validity of the attributes on every read. If /// the attributes are no longer valid (i.e., if the *attribute* timeout has expired) then /// FUSE will first send another `getattr` request. If the new mtime differs from the /// previous value, any cached file *contents* will be invalidated as well. /// /// This flag should always be set when available. If all file changes go through the /// kernel, *attribute* validity should be set to a very large number to avoid unnecessary /// `getattr()` calls. /// /// This feature is enabled by default when supported by the kernel. const AUTO_INVAL_DATA = AUTO_INVAL_DATA; /// Indicates that the filesystem supports readdirplus. /// /// The feature is not enabled by default and should only be set if the filesystem /// implements the `readdirplus` method of the `FileSystem` trait. const DO_READDIRPLUS = DO_READDIRPLUS; /// Indicates that the filesystem supports adaptive readdirplus. /// /// If `DO_READDIRPLUS` is not set, this flag has no effect. /// /// If `DO_READDIRPLUS` is set and this flag is not set, the kernel will always issue /// `readdirplus()` requests to retrieve directory contents. 
/// /// If `DO_READDIRPLUS` is set and this flag is set, the kernel will issue both `readdir()` /// and `readdirplus()` requests, depending on how much information is expected to be /// required. /// /// This feature is not enabled by default and should only be set if the file system /// implements both the `readdir` and `readdirplus` methods of the `FileSystem` trait. const READDIRPLUS_AUTO = READDIRPLUS_AUTO; /// Indicates that the filesystem supports asynchronous direct I/O submission. /// /// If this capability is not requested/available, the kernel will ensure that there is at /// most one pending read and one pending write request per direct I/O file-handle at any /// time. /// /// This feature is enabled by default when supported by the kernel. const ASYNC_DIO = ASYNC_DIO; /// Indicates that writeback caching should be enabled. This means that individual write /// request may be buffered and merged in the kernel before they are sent to the file /// system. /// /// This feature is disabled by default. const WRITEBACK_CACHE = WRITEBACK_CACHE; /// Indicates support for zero-message opens. If this flag is set in the `capable` parameter /// of the `init` trait method, then the file system may return `ENOSYS` from the open() handler /// to indicate success. Further attempts to open files will be handled in the kernel. (If /// this flag is not set, returning ENOSYS will be treated as an error and signaled to the /// caller). /// /// Setting (or not setting) the field in the `FsOptions` returned from the `init` method /// has no effect. const ZERO_MESSAGE_OPEN = NO_OPEN_SUPPORT; /// Indicates support for parallel directory operations. If this flag is unset, the FUSE /// kernel module will ensure that lookup() and readdir() requests are never issued /// concurrently for the same directory. /// /// This feature is enabled by default when supported by the kernel. 
const PARALLEL_DIROPS = PARALLEL_DIROPS; /// Indicates that the file system is responsible for unsetting setuid and setgid bits when a /// file is written, truncated, or its owner is changed. /// /// This feature is not currently supported. const HANDLE_KILLPRIV = HANDLE_KILLPRIV; /// Indicates support for POSIX ACLs. /// /// If this feature is enabled, the kernel will cache and have responsibility for enforcing /// ACLs. ACL will be stored as xattrs and passed to userspace, which is responsible for /// updating the ACLs in the filesystem, keeping the file mode in sync with the ACL, and /// ensuring inheritance of default ACLs when new filesystem nodes are created. Note that /// this requires that the file system is able to parse and interpret the xattr /// representation of ACLs. /// /// Enabling this feature implicitly turns on the `default_permissions` mount option (even /// if it was not passed to mount(2)). /// /// This feature is disabled by default. const POSIX_ACL = POSIX_ACL; /// Indicates that if the connection is gone because of sysfs abort, reading from the device /// will return -ECONNABORTED. /// /// This feature is not currently supported. const ABORT_ERROR = ABORT_ERROR; /// Indicates support for negotiating the maximum number of pages supported. /// /// If this feature is enabled, we can tell the kernel the maximum number of pages that we /// support to transfer in a single request. /// /// This feature is enabled by default if supported by the kernel. const MAX_PAGES = MAX_PAGES; /// Indicates that the kernel supports caching READLINK responses. /// /// This feature is not currently supported. const CACHE_SYMLINKS = CACHE_SYMLINKS; /// Indicates support for zero-message opens. If this flag is set in the `capable` parameter /// of the `init` trait method, then the file system may return `ENOSYS` from the opendir() handler /// to indicate success. Further attempts to open directories will be handled in the kernel. 
(If /// this flag is not set, returning ENOSYS will be treated as an error and signaled to the /// caller). /// /// Setting (or not setting) the field in the `FsOptions` returned from the `init` method /// has no effect. const ZERO_MESSAGE_OPENDIR = NO_OPENDIR_SUPPORT; /// Indicates support for explicit data invalidation. If this feature is enabled, the /// server is fully responsible for data cache invalidation, and the kernel won't /// invalidate files data cache on size change and only truncate that cache to new size /// in case the size decreased. /// /// This feature is not currently supported. const EXPLICIT_INVAL_DATA = EXPLICIT_INVAL_DATA; /// Indicates that the kernel supports the FUSE_ATTR_SUBMOUNT flag. /// /// Setting (or not setting) this flag in the `FsOptions` returned from the `init` method /// has no effect. const SUBMOUNTS = SUBMOUNTS; /// Indicates that the filesystem is responsible for clearing /// security.capability xattr and clearing setuid and setgid bits. Following /// are the rules. /// - clear "security.capability" on write, truncate and chown unconditionally /// - clear suid/sgid if following is true. Note, sgid is cleared only if /// group executable bit is set. /// o setattr has FATTR_SIZE and FATTR_KILL_SUIDGID set. /// o setattr has FATTR_UID or FATTR_GID /// o open has O_TRUNC and FUSE_OPEN_KILL_SUIDGID /// o create has O_TRUNC and FUSE_OPEN_KILL_SUIDGID flag set. /// o write has FUSE_WRITE_KILL_SUIDGID /// /// This feature is enabled by default if supported by the kernel. const HANDLE_KILLPRIV_V2 = HANDLE_KILLPRIV_V2; /// Server supports extended struct SetxattrIn const SETXATTR_EXT = SETXATTR_EXT; /// Indicates that fuse_init_in structure has been extended and /// expect extended struct coming in from kernel. const INIT_EXT = INIT_EXT; /// This bit is reserved. Don't use it. 
const INIT_RESERVED = INIT_RESERVED; /// Indicates that kernel is capable of sending a security /// context at file creation time (create, mkdir, symlink /// and mknod). This is expected to be a SELinux security /// context as of now. const SECURITY_CTX = SECURITY_CTX; /// Indicates that kernel is capable of understanding /// per inode dax flag sent in response to getattr /// request. This will allow server to enable to /// enable dax on selective files. const HAS_INODE_DAX = HAS_INODE_DAX; /// Add supplementary groups info to create, mkdir, symlink /// and mknod (single group that matches parent). const CREATE_SUPP_GROUP = CREATE_SUPP_GROUP; } } // Release flags. pub const RELEASE_FLUSH: u32 = 1 << 0; pub const RELEASE_FLOCK_UNLOCK: u32 = 1 << 1; // Getattr flags. pub const GETATTR_FH: u32 = 1 << 0; // Lock flags. pub const LK_FLOCK: u32 = 1 << 0; // Write flags. /// Delayed write from page cache, file handle is guessed. pub const WRITE_CACHE: u32 = 1 << 0; /// `lock_owner` field is valid. pub const WRITE_LOCKOWNER: u32 = 1 << 1; /// Kill suid and sgid bits pub const WRITE_KILL_PRIV: u32 = 1 << 2; // Read flags. pub const READ_LOCKOWNER: u32 = 1 << 1; // Ioctl flags. /// 32bit compat ioctl on 64bit machine const IOCTL_COMPAT: u32 = 1 << 0; /// Not restricted to well-formed ioctls, retry allowed const IOCTL_UNRESTRICTED: u32 = 1 << 1; /// Retry with new iovecs const IOCTL_RETRY: u32 = 1 << 2; /// 32bit ioctl const IOCTL_32BIT: u32 = 1 << 3; /// Is a directory const IOCTL_DIR: u32 = 1 << 4; /// x32 compat ioctl on 64bit machine (64bit time_t) const IOCTL_COMPAT_X32: u32 = 1 << 5; /// Maximum of in_iovecs + out_iovecs const IOCTL_MAX_IOV: u32 = 256; bitflags! 
{ pub struct IoctlFlags: u32 { /// 32bit compat ioctl on 64bit machine const IOCTL_COMPAT = IOCTL_COMPAT; /// Not restricted to well-formed ioctls, retry allowed const IOCTL_UNRESTRICTED = IOCTL_UNRESTRICTED; /// Retry with new iovecs const IOCTL_RETRY = IOCTL_RETRY; /// 32bit ioctl const IOCTL_32BIT = IOCTL_32BIT; /// Is a directory const IOCTL_DIR = IOCTL_DIR; /// x32 compat ioctl on 64bit machine (64bit time_t) const IOCTL_COMPAT_X32 = IOCTL_COMPAT_X32; /// Maximum of in_iovecs + out_iovecs const IOCTL_MAX_IOV = IOCTL_MAX_IOV; } } /// Request poll notify. pub const POLL_SCHEDULE_NOTIFY: u32 = 1 << 0; /// The read buffer is required to be at least 8k, but may be much larger. pub const FUSE_MIN_READ_BUFFER: u32 = 8192; pub const FUSE_COMPAT_ENTRY_OUT_SIZE: u32 = 120; pub const FUSE_COMPAT_ATTR_OUT_SIZE: u32 = 96; pub const FUSE_COMPAT_MKNOD_IN_SIZE: u32 = 8; pub const FUSE_COMPAT_WRITE_IN_SIZE: u32 = 24; pub const FUSE_COMPAT_STATFS_SIZE: u32 = 48; pub const FUSE_COMPAT_INIT_OUT_SIZE: u32 = 8; pub const FUSE_COMPAT_22_INIT_OUT_SIZE: u32 = 24; pub const FUSE_COMPAT_SETXATTR_IN_SIZE: u32 = 8; // Fsync flags pub const FSYNC_FDATASYNC: u32 = 1 << 0; // Attr.flags flags. /// Object is a submount root pub const ATTR_SUBMOUNT: u32 = 1 << 0; /// Indicate to kernel to enable DAX for this file in per inode DAX mode pub const ATTR_DAX: u32 = 1 << 1; // Open flags /// Kill suid and sgid if executable pub const OPEN_KILL_SUIDGID: u32 = 1 << 0; // setxattr flags /// Clear SGID when system.posix_acl_access is set const SETXATTR_ACL_KILL_SGID: u32 = 1 << 0; bitflags! { pub struct SetxattrFlags: u32 { /// Clear SGID when system.posix_acl_access is set const SETXATTR_ACL_KILL_SGID = SETXATTR_ACL_KILL_SGID; } } // Message definitions follow. It is safe to implement ByteValued for all of these // because they are POD types. 
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Attr {
    pub ino: u64,
    pub size: u64,
    pub blocks: u64,
    pub atime: u64,
    pub mtime: u64,
    pub ctime: u64,
    pub atimensec: u32,
    pub mtimensec: u32,
    pub ctimensec: u32,
    pub mode: u32,
    pub nlink: u32,
    pub uid: u32,
    pub gid: u32,
    pub rdev: u32,
    pub blksize: u32,
    pub flags: u32,
}
// SAFETY: `Attr` is `#[repr(C)]` and made up entirely of plain integer
// fields, so every byte pattern is a valid value.
unsafe impl ByteValued for Attr {}

// NOTE(review): the extraction stripped the generic argument of this `impl
// From`; it is restored from the `from()` parameter type below.
impl From<libc::stat64> for Attr {
    fn from(st: libc::stat64) -> Attr {
        Attr::with_flags(st, 0)
    }
}

impl Attr {
    /// Build an `Attr` from a `stat64` result, attaching the given FUSE
    /// attribute `flags` (e.g. `ATTR_SUBMOUNT`, `ATTR_DAX`).
    pub fn with_flags(st: libc::stat64, flags: u32) -> Attr {
        Attr {
            ino: st.st_ino,
            size: st.st_size as u64,
            blocks: st.st_blocks as u64,
            atime: st.st_atime as u64,
            mtime: st.st_mtime as u64,
            ctime: st.st_ctime as u64,
            atimensec: st.st_atime_nsec as u32,
            mtimensec: st.st_mtime_nsec as u32,
            ctimensec: st.st_ctime_nsec as u32,
            mode: st.st_mode,
            nlink: st.st_nlink as u32,
            uid: st.st_uid,
            gid: st.st_gid,
            rdev: st.st_rdev as u32,
            blksize: st.st_blksize as u32,
            flags,
        }
    }
}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Kstatfs {
    pub blocks: u64,
    pub bfree: u64,
    pub bavail: u64,
    pub files: u64,
    pub ffree: u64,
    pub bsize: u32,
    pub namelen: u32,
    pub frsize: u32,
    pub padding: u32,
    pub spare: [u32; 6],
}
// SAFETY: `Kstatfs` is `#[repr(C)]` and made up entirely of plain integer
// fields, so every byte pattern is a valid value.
unsafe impl ByteValued for Kstatfs {}

// NOTE(review): the extraction stripped the generic argument of this `impl
// From`; it is restored from the `from()` parameter type below.
impl From<libc::statvfs64> for Kstatfs {
    fn from(st: libc::statvfs64) -> Self {
        Kstatfs {
            blocks: st.f_blocks,
            bfree: st.f_bfree,
            bavail: st.f_bavail,
            files: st.f_files,
            ffree: st.f_ffree,
            bsize: st.f_bsize as u32,
            namelen: st.f_namemax as u32,
            frsize: st.f_frsize as u32,
            ..Default::default()
        }
    }
}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct FileLock {
    pub start: u64,
    pub end: u64,
    pub type_: u32,
    pub pid: u32, /* tgid */
}
// SAFETY: `FileLock` is `#[repr(C)]` and made up entirely of plain integer
// fields, so every byte pattern is a valid value.
unsafe impl ByteValued for FileLock {}

enum_value!
{ #[derive(Debug, Copy, Clone)] pub enum Opcode: u32 { Lookup = 1, Forget = 2, /* No Reply */ Getattr = 3, Setattr = 4, Readlink = 5, Symlink = 6, Mknod = 8, Mkdir = 9, Unlink = 10, Rmdir = 11, Rename = 12, Link = 13, Open = 14, Read = 15, Write = 16, Statfs = 17, Release = 18, Fsync = 20, Setxattr = 21, Getxattr = 22, Listxattr = 23, Removexattr = 24, Flush = 25, Init = 26, Opendir = 27, Readdir = 28, Releasedir = 29, Fsyncdir = 30, Getlk = 31, Setlk = 32, Setlkw = 33, Access = 34, Create = 35, Interrupt = 36, Bmap = 37, Destroy = 38, Ioctl = 39, Poll = 40, NotifyReply = 41, BatchForget = 42, Fallocate = 43, Readdirplus = 44, Rename2 = 45, Lseek = 46, CopyFileRange = 47, SetupMapping = 48, RemoveMapping = 49, Syncfs = 50, TmpFile = 51, } } #[repr(u32)] #[derive(Debug, Copy, Clone)] pub enum NotifyOpcode { Poll = 1, InvalInode = 2, InvalEntry = 3, Store = 4, Retrieve = 5, Delete = 6, CodeMax = 7, } #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct EntryOut { pub nodeid: u64, /* Inode ID */ pub generation: u64, /* Inode generation: nodeid:gen must be unique for the fs's lifetime */ pub entry_valid: u64, /* Cache timeout for the name */ pub attr_valid: u64, /* Cache timeout for the attributes */ pub entry_valid_nsec: u32, pub attr_valid_nsec: u32, pub attr: Attr, } unsafe impl ByteValued for EntryOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct ForgetIn { pub nlookup: u64, } unsafe impl ByteValued for ForgetIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct ForgetOne { pub nodeid: u64, pub nlookup: u64, } unsafe impl ByteValued for ForgetOne {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct BatchForgetIn { pub count: u32, pub dummy: u32, } unsafe impl ByteValued for BatchForgetIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct GetattrIn { pub flags: u32, pub dummy: u32, pub fh: u64, } unsafe impl ByteValued for GetattrIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct AttrOut 
{
    pub attr_valid: u64, /* Cache timeout for the attributes */
    pub attr_valid_nsec: u32,
    pub dummy: u32,
    pub attr: Attr,
}
unsafe impl ByteValued for AttrOut {}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct MknodIn {
    pub mode: u32,
    pub rdev: u32,
    pub umask: u32,
    pub padding: u32,
}
unsafe impl ByteValued for MknodIn {}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct MkdirIn {
    pub mode: u32,
    pub umask: u32,
}
unsafe impl ByteValued for MkdirIn {}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct RenameIn {
    pub newdir: u64,
}
unsafe impl ByteValued for RenameIn {}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct Rename2In {
    pub newdir: u64,
    pub flags: u32,
    pub padding: u32,
}
unsafe impl ByteValued for Rename2In {}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct LinkIn {
    pub oldnodeid: u64,
}
unsafe impl ByteValued for LinkIn {}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SetattrIn {
    pub valid: u32,
    pub padding: u32,
    pub fh: u64,
    pub size: u64,
    pub lock_owner: u64,
    pub atime: u64,
    pub mtime: u64,
    pub ctime: u64,
    pub atimensec: u32,
    pub mtimensec: u32,
    pub ctimensec: u32,
    pub mode: u32,
    pub unused4: u32,
    pub uid: u32,
    pub gid: u32,
    pub unused5: u32,
}
unsafe impl ByteValued for SetattrIn {}

// NOTE(review): the extraction stripped the generic argument of this `impl
// From`; it is restored from the `from()` parameter type below.
impl From<SetattrIn> for libc::stat64 {
    fn from(sai: SetattrIn) -> libc::stat64 {
        // SAFETY: `libc::stat64` is a plain C struct of integer fields for
        // which the all-zero bit pattern is a valid value.
        let mut out: libc::stat64 = unsafe { mem::zeroed() };
        out.st_mode = sai.mode;
        out.st_uid = sai.uid;
        out.st_gid = sai.gid;
        out.st_size = sai.size as i64;
        out.st_atime = sai.atime as i64;
        out.st_mtime = sai.mtime as i64;
        out.st_ctime = sai.ctime as i64;
        out.st_atime_nsec = sai.atimensec.into();
        out.st_mtime_nsec = sai.mtimensec.into();
        out.st_ctime_nsec = sai.ctimensec.into();
        out
    }
}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct OpenIn {
    pub flags: u32,
    pub open_flags: u32,
}
unsafe impl ByteValued for OpenIn {}

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct CreateIn {
    pub flags: u32,
    pub mode: u32,
    pub
umask: u32, pub open_flags: u32, } unsafe impl ByteValued for CreateIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct OpenOut { pub fh: u64, pub open_flags: u32, pub padding: u32, } unsafe impl ByteValued for OpenOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct ReleaseIn { pub fh: u64, pub flags: u32, pub release_flags: u32, pub lock_owner: u64, } unsafe impl ByteValued for ReleaseIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct FlushIn { pub fh: u64, pub unused: u32, pub padding: u32, pub lock_owner: u64, } unsafe impl ByteValued for FlushIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct ReadIn { pub fh: u64, pub offset: u64, pub size: u32, pub read_flags: u32, pub lock_owner: u64, pub flags: u32, pub padding: u32, } unsafe impl ByteValued for ReadIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct WriteIn { pub fh: u64, pub offset: u64, pub size: u32, pub write_flags: u32, pub lock_owner: u64, pub flags: u32, pub padding: u32, } unsafe impl ByteValued for WriteIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct WriteOut { pub size: u32, pub padding: u32, } unsafe impl ByteValued for WriteOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct StatfsOut { pub st: Kstatfs, } unsafe impl ByteValued for StatfsOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct FsyncIn { pub fh: u64, pub fsync_flags: u32, pub padding: u32, } unsafe impl ByteValued for FsyncIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct SetxattrIn { pub size: u32, pub flags: u32, pub setxattr_flags: u32, pub padding: u32, } unsafe impl ByteValued for SetxattrIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct SetxattrInCompat { pub size: u32, pub flags: u32, } unsafe impl ByteValued for SetxattrInCompat {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct GetxattrIn { pub size: u32, pub padding: u32, } unsafe impl ByteValued for 
GetxattrIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct GetxattrOut { pub size: u32, pub padding: u32, } unsafe impl ByteValued for GetxattrOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct LkIn { pub fh: u64, pub owner: u64, pub lk: FileLock, pub lk_flags: u32, pub padding: u32, } unsafe impl ByteValued for LkIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct LkOut { pub lk: FileLock, } unsafe impl ByteValued for LkOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct AccessIn { pub mask: u32, pub padding: u32, } unsafe impl ByteValued for AccessIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct InitInCompat { pub major: u32, pub minor: u32, pub max_readahead: u32, pub flags: u32, } unsafe impl ByteValued for InitInCompat {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct InitInExt { pub flags2: u32, pub unused: [u32; 11], } unsafe impl ByteValued for InitInExt {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct InitOut { pub major: u32, pub minor: u32, pub max_readahead: u32, pub flags: u32, pub max_background: u16, pub congestion_threshold: u16, pub max_write: u32, pub time_gran: u32, pub max_pages: u16, pub map_alignment: u16, pub flags2: u32, pub unused: [u32; 7], } unsafe impl ByteValued for InitOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct InterruptIn { pub unique: u64, } unsafe impl ByteValued for InterruptIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct BmapIn { pub block: u64, pub blocksize: u32, pub padding: u32, } unsafe impl ByteValued for BmapIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct BmapOut { pub block: u64, } unsafe impl ByteValued for BmapOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct IoctlIn { pub fh: u64, pub flags: u32, pub cmd: u32, pub arg: u64, pub in_size: u32, pub out_size: u32, } unsafe impl ByteValued for IoctlIn {} #[repr(C)] #[derive(Debug, 
Default, Copy, Clone)] pub struct IoctlIovec { pub base: u64, pub len: u64, } unsafe impl ByteValued for IoctlIovec {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct IoctlOut { pub result: i32, pub flags: u32, pub in_iovs: u32, pub out_iovs: u32, } unsafe impl ByteValued for IoctlOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct PollIn { pub fh: u64, pub kh: u64, pub flags: u32, pub events: u32, } unsafe impl ByteValued for PollIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct PollOut { pub revents: u32, pub padding: u32, } unsafe impl ByteValued for PollOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct NotifyPollWakeupOut { pub kh: u64, } unsafe impl ByteValued for NotifyPollWakeupOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct FallocateIn { pub fh: u64, pub offset: u64, pub length: u64, pub mode: u32, pub padding: u32, } unsafe impl ByteValued for FallocateIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct InHeader { pub len: u32, pub opcode: u32, pub unique: u64, pub nodeid: u64, pub uid: u32, pub gid: u32, pub pid: u32, pub total_extlen: u16, // length of extensions in 8-byte units pub padding: u16, } unsafe impl ByteValued for InHeader {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct OutHeader { pub len: u32, pub error: i32, pub unique: u64, } unsafe impl ByteValued for OutHeader {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct Dirent { pub ino: u64, pub off: u64, pub namelen: u32, pub type_: u32, // char name[]; } unsafe impl ByteValued for Dirent {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct Direntplus { pub entry_out: EntryOut, pub dirent: Dirent, } unsafe impl ByteValued for Direntplus {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct NotifyInvalInodeOut { pub ino: u64, pub off: i64, pub len: i64, } unsafe impl ByteValued for NotifyInvalInodeOut {} const FUSE_EXPIRE_ONLY: u32 = 1 << 0; 
bitflags! { pub struct NotifyInvalEntryOutFlags: u32 { const EXPIRE_ONLY = FUSE_EXPIRE_ONLY; } } #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct NotifyInvalEntryOut { pub parent: u64, pub namelen: u32, pub flags: u32, } unsafe impl ByteValued for NotifyInvalEntryOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct NotifyDeleteOut { pub parent: u64, pub child: u64, pub namelen: u32, pub padding: u32, } unsafe impl ByteValued for NotifyDeleteOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct NotifyStoreOut { pub nodeid: u64, pub offset: u64, pub size: u32, pub padding: u32, } unsafe impl ByteValued for NotifyStoreOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct Notify_Retrieve_Out { pub notify_unique: u64, pub nodeid: u64, pub offset: u64, pub size: u32, pub padding: u32, } unsafe impl ByteValued for Notify_Retrieve_Out {} /* Matches the size of fuse_write_in */ #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct NotifyRetrieveIn { pub dummy1: u64, pub offset: u64, pub size: u32, pub dummy2: u32, pub dummy3: u64, pub dummy4: u64, } unsafe impl ByteValued for NotifyRetrieveIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct LseekIn { pub fh: u64, pub offset: u64, pub whence: u32, pub padding: u32, } unsafe impl ByteValued for LseekIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct LseekOut { pub offset: u64, } unsafe impl ByteValued for LseekOut {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct CopyfilerangeIn { pub fh_in: u64, pub off_in: u64, pub nodeid_out: u64, pub fh_out: u64, pub off_out: u64, pub len: u64, pub flags: u64, } unsafe impl ByteValued for CopyfilerangeIn {} const SETUPMAPPING_FLAG_WRITE: u64 = 1 << 0; const SETUPMAPPING_FLAG_READ: u64 = 1 << 1; bitflags! 
{ pub struct SetupmappingFlags: u64 { const WRITE = SETUPMAPPING_FLAG_WRITE; const READ = SETUPMAPPING_FLAG_READ; } } #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct SetupmappingIn { pub fh: u64, pub foffset: u64, pub len: u64, pub flags: u64, pub moffset: u64, } unsafe impl ByteValued for SetupmappingIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct RemovemappingIn { pub count: u32, } unsafe impl ByteValued for RemovemappingIn {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct RemovemappingOne { pub moffset: u64, pub len: u64, } unsafe impl ByteValued for RemovemappingOne {} #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct SyncfsIn { pub padding: u64, } unsafe impl ByteValued for SyncfsIn {} /// Extension header /// `size`: total size of this extension including this header /// `ext_type`: type of extension /// This is made compatible with `SecctxHeader` by using type values > `FUSE_MAX_NR_SECCTX` #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct ExtHeader { pub size: u32, pub ext_type: u32, } /// Extension types /// Types `0..MAX_NR_SECCTX` are reserved for `SecCtx` extension for backward compatibility. const MAX_NR_SECCTX: u32 = 31; // Maximum value of `SecctxHeader::nr_secctx` const EXT_SUP_GROUPS: u32 = 32; unsafe impl ByteValued for ExtHeader {} /// Extension type #[derive(Debug, Copy, Clone)] pub enum ExtType { /// Security contexts SecCtx(u32), /// `Supplementary groups SupGroups, } impl TryFrom for ExtType { type Error = (); fn try_from(value: u32) -> Result { match value { v if v <= MAX_NR_SECCTX => Ok(Self::SecCtx(value)), v if v == EXT_SUP_GROUPS => Ok(Self::SupGroups), _ => Err(()), } } } /// For each security context, send `Secctx` with size of security context /// `Secctx` will be followed by security context name and this in turn /// will be followed by actual context label. 
/// `Secctx`, name, context #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct Secctx { pub size: u32, pub padding: u32, } unsafe impl ByteValued for Secctx {} /// Contains the information about how many `Secctx` structures are being /// sent and what's the total size of all security contexts (including /// size of `SecctxHeader`). #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct SecctxHeader { pub size: u32, pub nr_secctx: u32, } unsafe impl ByteValued for SecctxHeader {} /// Supplementary groups extension /// `nr_groups`: number of supplementary groups /// `groups`: flexible array of group IDs #[repr(C)] #[derive(Debug, Default, Copy, Clone)] pub struct SuppGroups { pub nr_groups: u32, // uint32_t groups[]; } unsafe impl ByteValued for SuppGroups {} virtiofsd-1.10.0/src/idmap.rs000064400000000000000000000064201046102023000141450ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause use std::fmt; use std::num::ParseIntError; use std::str::FromStr; /// Expected error conditions with respect to parsing both UidMap and GidMap #[derive(Debug, Eq, PartialEq)] pub enum IdMapError { /// A delimiter has been found that does not match the delimiter the map started with. InvalidDelimiter, /// The map is empty or incorrect number of values are provided. IncompleteMap, /// Wraps the cause of parsing an integer failing. 
InvalidValue(ParseIntError), } impl std::error::Error for IdMapError {} impl fmt::Display for IdMapError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { IdMapError::InvalidDelimiter => write!( f, "A delimiter has been found that does not match the delimiter the map started with" ), IdMapError::IncompleteMap => write!( f, "The map is empty or incorrect number of values are provided" ), IdMapError::InvalidValue(err) => write!(f, "{}", err), } } } impl From for IdMapError { fn from(err: ParseIntError) -> Self { IdMapError::InvalidValue(err) } } #[derive(Clone, Debug, PartialEq, Eq)] pub struct UidMap { pub inside_uid: u32, pub outside_uid: u32, pub count: u32, } impl FromStr for UidMap { type Err = IdMapError; fn from_str(s: &str) -> std::result::Result { let fields = parse_idmap(s, 3)?; Ok(UidMap { inside_uid: fields[0], outside_uid: fields[1], count: fields[2], }) } } impl fmt::Display for UidMap { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, ":{}:{}:{}:", self.inside_uid, self.outside_uid, self.count ) } } #[derive(Clone, Debug, PartialEq, Eq)] pub struct GidMap { pub inside_gid: u32, pub outside_gid: u32, pub count: u32, } impl FromStr for GidMap { type Err = IdMapError; fn from_str(s: &str) -> std::result::Result { let fields = parse_idmap(s, 3)?; Ok(GidMap { inside_gid: fields[0], outside_gid: fields[1], count: fields[2], }) } } impl fmt::Display for GidMap { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, ":{}:{}:{}:", self.inside_gid, self.outside_gid, self.count ) } } fn parse_idmap(s: &str, expected_len: usize) -> std::result::Result, IdMapError> { let mut s = String::from(s); let delimiter = s.pop().ok_or(IdMapError::IncompleteMap)?; if delimiter.is_alphanumeric() { return Err(IdMapError::InvalidDelimiter); } let values: Vec<&str> = s .strip_prefix(delimiter) .ok_or(IdMapError::InvalidDelimiter)? 
.split(delimiter) .collect(); if values.len() != expected_len { return Err(IdMapError::IncompleteMap); } values .into_iter() .map(|v| v.parse().map_err(IdMapError::InvalidValue)) .collect() } #[derive(Debug, Eq, PartialEq)] #[repr(u8)] pub(crate) enum IdMapSetUpPipeMessage { Request = 0x1, Done = 0x2, } virtiofsd-1.10.0/src/lib.rs000064400000000000000000000046161046102023000136260ustar 00000000000000// Copyright © 2019 Intel Corporation // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause #[macro_use] extern crate log; pub mod descriptor_utils; pub mod file_traits; pub mod filesystem; pub mod fs_cache_req_handler; pub mod fuse; pub mod idmap; pub mod limits; pub mod macros; pub mod oslib; pub mod passthrough; pub mod read_dir; pub mod sandbox; pub mod seccomp; pub mod server; pub mod util; use std::ffi::{FromBytesWithNulError, FromVecWithNulError}; use std::{error, fmt, io}; #[derive(Debug)] pub enum Error { /// Failed to decode protocol messages. DecodeMessage(io::Error), /// Failed to encode protocol messages. EncodeMessage(io::Error), /// Failed to flush protocol messages. FlushMessage(io::Error), /// One or more parameters are missing. MissingParameter, /// A C string parameter is invalid. InvalidCString(FromBytesWithNulError), /// A C string parameter is invalid. InvalidCString2(FromVecWithNulError), /// The `len` field of the header is too small. InvalidHeaderLength, /// The `size` field of the `SetxattrIn` message does not match the length /// of the decoded value. InvalidXattrSize((u32, usize)), /// One or more extensions are missing. 
MissingExtension, } impl error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use Error::*; match self { DecodeMessage(err) => write!(f, "failed to decode fuse message: {err}"), EncodeMessage(err) => write!(f, "failed to encode fuse message: {err}"), FlushMessage(err) => write!(f, "failed to flush fuse message: {err}"), MissingParameter => write!(f, "one or more parameters are missing"), InvalidHeaderLength => write!(f, "the `len` field of the header is too small"), InvalidCString(err) => write!(f, "a c string parameter is invalid: {err}"), InvalidCString2(err) => write!(f, "a c string parameter is invalid: {err}"), InvalidXattrSize((size, len)) => write!( f, "The `size` field of the `SetxattrIn` message does not match the length of the\ decoded value: size = {size}, value.len() = {len}" ), MissingExtension => write!(f, "one or more extensions are missing"), } } } pub type Result = ::std::result::Result; virtiofsd-1.10.0/src/limits.rs000064400000000000000000000052351046102023000143570ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause use libc::{getrlimit, rlim_t, rlimit, setrlimit, RLIMIT_NOFILE}; use std::mem::MaybeUninit; use std::result::Result; use std::{cmp, fs, io}; // Default number of open files (RLIMIT_NOFILE) const DEFAULT_NOFILE: rlim_t = 1_000_000; /// Gets the maximum number of open files. fn get_max_nofile() -> Result { let path = "/proc/sys/fs/nr_open"; let max_str = fs::read_to_string(path).map_err(|error| format!("Reading {path}: {error:?}"))?; max_str .trim() .parse() .map_err(|error| format!("Parsing {path}: {error:?}")) } /// Gets the hard limit of open files. fn get_nofile_limits() -> Result { let mut limits = MaybeUninit::::zeroed(); let ret = unsafe { getrlimit(RLIMIT_NOFILE, limits.as_mut_ptr()) }; if ret != 0 { return Err(format!("getrlimit: {}", io::Error::last_os_error())); } Ok(unsafe { limits.assume_init() }) } /// Sets the limit of open files to the given value. 
fn setup_rlimit_nofile_to(nofile: rlim_t) -> Result<(), String> { let rlimit = rlimit { rlim_cur: nofile, rlim_max: nofile, }; let ret = unsafe { setrlimit(RLIMIT_NOFILE, &rlimit) }; if ret < 0 { Err(format!( "Failed to increase the limit: {:?}", io::Error::last_os_error() )) } else { Ok(()) } } pub fn setup_rlimit_nofile(nofile: Option) -> Result<(), String> { let max_nofile = get_max_nofile()?; let rlimit { rlim_cur, rlim_max } = get_nofile_limits()?; let target_limit = if let Some(nofile) = nofile { if nofile == 0 { return Ok(()); // '--rlimit-nofile=0' leaves the resource limit unchanged } nofile } else { if DEFAULT_NOFILE <= rlim_cur { return Ok(()); // the user has already setup the soft limit higher than the target } cmp::min(DEFAULT_NOFILE, max_nofile) }; if target_limit > max_nofile { return Err(format!("It cannot be increased above {max_nofile}")); } if let Err(error) = setup_rlimit_nofile_to(target_limit) { if nofile.is_some() { // Error attempting to setup user-supplied value return Err(error); } else { warn!( "Failure when trying to set the limit to {}, \ the hard limit ({}) of open file descriptors is used instead.", target_limit, rlim_max ); setup_rlimit_nofile_to(rlim_max).map_err(|error| { format!("Cannot increase the soft limit to the hard limit: {error}") })? } } Ok(()) } virtiofsd-1.10.0/src/macros.rs000064400000000000000000000016011046102023000143330ustar 00000000000000// Copyright 2022 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. macro_rules! 
enum_value { ( $(#[$meta:meta])* $vis:vis enum $enum:ident: $T:tt { $( $(#[$variant_meta:meta])* $variant:ident $(= $val:expr)?, )* } ) => { #[repr($T)] $(#[$meta])* $vis enum $enum { $($(#[$variant_meta])* $variant $(= $val)?,)* } impl std::convert::TryFrom<$T> for $enum { type Error = (); fn try_from(v: $T) -> Result { match v { $(v if v == $enum::$variant as $T => Ok($enum::$variant),)* _ => Err(()), } } } } } pub(crate) use enum_value; virtiofsd-1.10.0/src/main.rs000064400000000000000000001207551046102023000140070ustar 00000000000000// Copyright 2019 Intel Corporation. All Rights Reserved. // // SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause) use futures::executor::{ThreadPool, ThreadPoolBuilder}; use libc::EFD_NONBLOCK; use log::*; use passthrough::xattrmap::XattrMap; use std::collections::HashSet; use std::convert::{self, TryFrom, TryInto}; use std::ffi::CString; use std::os::unix::io::{FromRawFd, RawFd}; use std::path::Path; use std::str::FromStr; use std::sync::{Arc, RwLock}; use std::time::Duration; use std::{env, error, fmt, io, process}; use virtiofsd::idmap::{GidMap, UidMap}; use clap::{CommandFactory, Parser}; use vhost::vhost_user::message::*; use vhost::vhost_user::Error::Disconnected; use vhost::vhost_user::{Backend, Listener}; use vhost_user_backend::Error::HandleRequest; use vhost_user_backend::{VhostUserBackend, VhostUserDaemon, VringMutex, VringState, VringT}; use virtio_bindings::bindings::virtio_config::*; use virtio_bindings::bindings::virtio_ring::{ VIRTIO_RING_F_EVENT_IDX, VIRTIO_RING_F_INDIRECT_DESC, }; use virtio_queue::{DescriptorChain, QueueOwnedT}; use virtiofsd::descriptor_utils::{Error as VufDescriptorError, Reader, Writer}; use virtiofsd::filesystem::FileSystem; use virtiofsd::passthrough::{self, CachePolicy, InodeFileHandlesMode, PassthroughFs}; use virtiofsd::sandbox::{Sandbox, SandboxMode}; use virtiofsd::seccomp::{enable_seccomp, SeccompAction}; use virtiofsd::server::Server; use virtiofsd::util::write_pid_file; use 
virtiofsd::{limits, oslib, Error as VhostUserFsError}; use vm_memory::{ ByteValued, GuestAddressSpace, GuestMemoryAtomic, GuestMemoryLoadGuard, GuestMemoryMmap, Le32, }; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; const QUEUE_SIZE: usize = 1024; // The spec allows for multiple request queues. We currently only support one. const REQUEST_QUEUES: u32 = 1; // In addition to the request queue there is one high-prio queue. // Since VIRTIO_FS_F_NOTIFICATION is not advertised we do not have a // notification queue. const NUM_QUEUES: usize = REQUEST_QUEUES as usize + 1; // The guest queued an available buffer for the high priority queue. const HIPRIO_QUEUE_EVENT: u16 = 0; // The guest queued an available buffer for the request queue. const REQ_QUEUE_EVENT: u16 = 1; const MAX_TAG_LEN: usize = 36; type Result = std::result::Result; type VhostUserBackendResult = std::result::Result; #[derive(Debug)] enum Error { /// Failed to create kill eventfd. CreateKillEventFd(io::Error), /// Failed to create thread pool. CreateThreadPool(io::Error), /// Failed to handle event other than input event. HandleEventNotEpollIn, /// Failed to handle unknown event. HandleEventUnknownEvent, /// Iterating through the queue failed. IterateQueue, /// No memory configured. NoMemoryConfigured, /// Processing queue failed. ProcessQueue(VhostUserFsError), /// Creating a queue reader failed. QueueReader(VufDescriptorError), /// Creating a queue writer failed. QueueWriter(VufDescriptorError), /// The unshare(CLONE_FS) call failed. UnshareCloneFs(io::Error), /// Invalid tag name InvalidTag, } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Error::UnshareCloneFs; match self { UnshareCloneFs(error) => { write!( f, "The unshare(CLONE_FS) syscall failed with '{error}'. \ If running in a container please check that the container \ runtime seccomp policy allows unshare." 
) } Self::InvalidTag => write!( f, "The tag may not be empty or longer than {MAX_TAG_LEN} bytes (encoded as UTF-8)." ), _ => write!(f, "{self:?}"), } } } impl error::Error for Error {} impl convert::From for io::Error { fn from(e: Error) -> Self { io::Error::new(io::ErrorKind::Other, e) } } struct VhostUserFsThread { mem: Option>, kill_evt: EventFd, server: Arc>, // handle request from backend to frontend vu_req: Option, event_idx: bool, pool: Option, } impl Clone for VhostUserFsThread { fn clone(&self) -> Self { VhostUserFsThread { mem: self.mem.clone(), kill_evt: self.kill_evt.try_clone().unwrap(), server: self.server.clone(), vu_req: self.vu_req.clone(), event_idx: self.event_idx, pool: self.pool.clone(), } } } impl VhostUserFsThread { fn new(fs: F, thread_pool_size: usize) -> Result { let pool = if thread_pool_size > 0 { // Test that unshare(CLONE_FS) works, it will be called for each thread. // It's an unprivileged system call but some Docker/Moby versions are // known to reject it via seccomp when CAP_SYS_ADMIN is not given. // // Note that the program is single-threaded here so this syscall has no // visible effect and is safe to make. 
let ret = unsafe { libc::unshare(libc::CLONE_FS) }; if ret == -1 { return Err(Error::UnshareCloneFs(std::io::Error::last_os_error())); } Some( ThreadPoolBuilder::new() .after_start(|_| { // unshare FS for xattr operation let ret = unsafe { libc::unshare(libc::CLONE_FS) }; assert_eq!(ret, 0); // Should not fail }) .pool_size(thread_pool_size) .create() .map_err(Error::CreateThreadPool)?, ) } else { None }; Ok(VhostUserFsThread { mem: None, kill_evt: EventFd::new(EFD_NONBLOCK).map_err(Error::CreateKillEventFd)?, server: Arc::new(Server::new(fs)), vu_req: None, event_idx: false, pool, }) } fn return_descriptor( vring_state: &mut VringState, head_index: u16, event_idx: bool, len: usize, ) { let used_len: u32 = match len.try_into() { Ok(l) => l, Err(_) => panic!("Invalid used length, can't return used descritors to the ring"), }; if vring_state.add_used(head_index, used_len).is_err() { warn!("Couldn't return used descriptors to the ring"); } if event_idx { match vring_state.needs_notification() { Err(_) => { warn!("Couldn't check if queue needs to be notified"); vring_state.signal_used_queue().unwrap(); } Ok(needs_notification) => { if needs_notification { vring_state.signal_used_queue().unwrap(); } } } } else { vring_state.signal_used_queue().unwrap(); } } fn process_queue_pool(&self, vring: VringMutex) -> Result { let mut used_any = false; let atomic_mem = match &self.mem { Some(m) => m, None => return Err(Error::NoMemoryConfigured), }; while let Some(avail_desc) = vring .get_mut() .get_queue_mut() .iter(atomic_mem.memory()) .map_err(|_| Error::IterateQueue)? .next() { used_any = true; // Prepare a set of objects that can be moved to the worker thread. 
let atomic_mem = atomic_mem.clone(); let server = self.server.clone(); let mut vu_req = self.vu_req.clone(); let event_idx = self.event_idx; let worker_vring = vring.clone(); let worker_desc = avail_desc.clone(); self.pool.as_ref().unwrap().spawn_ok(async move { let mem = atomic_mem.memory(); let head_index = worker_desc.head_index(); let reader = Reader::new(&mem, worker_desc.clone()) .map_err(Error::QueueReader) .unwrap(); let writer = Writer::new(&mem, worker_desc.clone()) .map_err(Error::QueueWriter) .unwrap(); let len = server .handle_message(reader, writer, vu_req.as_mut()) .map_err(Error::ProcessQueue) .unwrap(); Self::return_descriptor(&mut worker_vring.get_mut(), head_index, event_idx, len); }); } Ok(used_any) } fn process_queue_serial(&self, vring_state: &mut VringState) -> Result { let mut used_any = false; let mem = match &self.mem { Some(m) => m.memory(), None => return Err(Error::NoMemoryConfigured), }; let mut vu_req = self.vu_req.clone(); let avail_chains: Vec>> = vring_state .get_queue_mut() .iter(mem.clone()) .map_err(|_| Error::IterateQueue)? 
.collect(); for chain in avail_chains { used_any = true; let head_index = chain.head_index(); let reader = Reader::new(&mem, chain.clone()) .map_err(Error::QueueReader) .unwrap(); let writer = Writer::new(&mem, chain.clone()) .map_err(Error::QueueWriter) .unwrap(); let len = self .server .handle_message(reader, writer, vu_req.as_mut()) .map_err(Error::ProcessQueue) .unwrap(); Self::return_descriptor(vring_state, head_index, self.event_idx, len); } Ok(used_any) } fn handle_event_pool( &self, device_event: u16, vrings: &[VringMutex], ) -> VhostUserBackendResult<()> { let idx = match device_event { HIPRIO_QUEUE_EVENT => { debug!("HIPRIO_QUEUE_EVENT"); 0 } REQ_QUEUE_EVENT => { debug!("QUEUE_EVENT"); 1 } _ => return Err(Error::HandleEventUnknownEvent.into()), }; if self.event_idx { // vm-virtio's Queue implementation only checks avail_index // once, so to properly support EVENT_IDX we need to keep // calling process_queue() until it stops finding new // requests on the queue. loop { vrings[idx].disable_notification().unwrap(); self.process_queue_pool(vrings[idx].clone())?; if !vrings[idx].enable_notification().unwrap() { break; } } } else { // Without EVENT_IDX, a single call is enough. self.process_queue_pool(vrings[idx].clone())?; } Ok(()) } fn handle_event_serial( &self, device_event: u16, vrings: &[VringMutex], ) -> VhostUserBackendResult<()> { let mut vring_state = match device_event { HIPRIO_QUEUE_EVENT => { debug!("HIPRIO_QUEUE_EVENT"); vrings[0].get_mut() } REQ_QUEUE_EVENT => { debug!("QUEUE_EVENT"); vrings[1].get_mut() } _ => return Err(Error::HandleEventUnknownEvent.into()), }; if self.event_idx { // vm-virtio's Queue implementation only checks avail_index // once, so to properly support EVENT_IDX we need to keep // calling process_queue() until it stops finding new // requests on the queue. 
loop { vring_state.disable_notification().unwrap(); self.process_queue_serial(&mut vring_state)?; if !vring_state.enable_notification().unwrap() { break; } } } else { // Without EVENT_IDX, a single call is enough. self.process_queue_serial(&mut vring_state)?; } Ok(()) } } #[repr(C)] #[derive(Clone, Copy)] struct VirtioFsConfig { tag: [u8; MAX_TAG_LEN], num_request_queues: Le32, } // vm-memory needs a Default implementation even though these values are never // used anywhere... impl Default for VirtioFsConfig { fn default() -> Self { Self { tag: [0; MAX_TAG_LEN], num_request_queues: Le32::default(), } } } unsafe impl ByteValued for VirtioFsConfig {} struct VhostUserFsBackend { thread: RwLock>, tag: Option, } impl VhostUserFsBackend { fn new(fs: F, thread_pool_size: usize, tag: Option) -> Result { let thread = RwLock::new(VhostUserFsThread::new(fs, thread_pool_size)?); Ok(VhostUserFsBackend { thread, tag }) } } impl VhostUserBackend for VhostUserFsBackend { type Bitmap = (); type Vring = VringMutex; fn num_queues(&self) -> usize { NUM_QUEUES } fn max_queue_size(&self) -> usize { QUEUE_SIZE } fn features(&self) -> u64 { 1 << VIRTIO_F_VERSION_1 | 1 << VIRTIO_RING_F_INDIRECT_DESC | 1 << VIRTIO_RING_F_EVENT_IDX | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() } fn protocol_features(&self) -> VhostUserProtocolFeatures { let mut protocol_features = VhostUserProtocolFeatures::MQ | VhostUserProtocolFeatures::BACKEND_REQ | VhostUserProtocolFeatures::BACKEND_SEND_FD | VhostUserProtocolFeatures::REPLY_ACK | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS; if self.tag.is_some() { protocol_features |= VhostUserProtocolFeatures::CONFIG; } protocol_features } fn get_config(&self, offset: u32, size: u32) -> Vec { // virtio spec 1.2, 5.11.4: // The tag is encoded in UTF-8 and padded with NUL bytes if shorter than // the available space. This field is not NUL-terminated if the encoded // bytes take up the entire field. // The length was already checked when parsing the arguments. 
Hence, we // only assert that everything looks sane and pad with NUL bytes to the // fixed length. let tag = self.tag.as_ref().expect("Did not expect read of config if tag is not set. We do not advertise F_CONFIG in that case!"); assert!(tag.len() <= MAX_TAG_LEN, "too long tag length"); assert!(!tag.is_empty(), "tag should not be empty"); let mut fixed_len_tag = [0; MAX_TAG_LEN]; fixed_len_tag[0..tag.len()].copy_from_slice(tag.as_bytes()); let config = VirtioFsConfig { tag: fixed_len_tag, num_request_queues: Le32::from(REQUEST_QUEUES), }; let offset = offset as usize; let size = size as usize; let mut result: Vec<_> = config .as_slice() .iter() .skip(offset) .take(size) .copied() .collect(); // pad with 0s up to `size` result.resize(size, 0); result } fn set_event_idx(&self, enabled: bool) { self.thread.write().unwrap().event_idx = enabled; } fn update_memory(&self, mem: GuestMemoryAtomic) -> VhostUserBackendResult<()> { self.thread.write().unwrap().mem = Some(mem); Ok(()) } fn handle_event( &self, device_event: u16, evset: EventSet, vrings: &[VringMutex], _thread_id: usize, ) -> VhostUserBackendResult<()> { if evset != EventSet::IN { return Err(Error::HandleEventNotEpollIn.into()); } let thread = self.thread.read().unwrap(); if thread.pool.is_some() { thread.handle_event_pool(device_event, vrings) } else { thread.handle_event_serial(device_event, vrings) } } fn exit_event(&self, _thread_index: usize) -> Option { Some(self.thread.read().unwrap().kill_evt.try_clone().unwrap()) } fn set_backend_req_fd(&self, vu_req: Backend) { self.thread.write().unwrap().vu_req = Some(vu_req); } } fn parse_seccomp(src: &str) -> std::result::Result { Ok(match src { "none" => SeccompAction::Allow, // i.e. no seccomp "kill" => SeccompAction::Kill, "log" => SeccompAction::Log, "trap" => SeccompAction::Trap, _ => return Err("Matching variant not found"), }) } /// On the command line, we want to allow aliases for `InodeFileHandlesMode` values. 
This enum has /// all values allowed on the command line, and with `From`/`Into`, it can be translated into the /// internally used `InodeFileHandlesMode` enum. #[derive(Debug, Copy, Clone, PartialEq, Eq)] enum InodeFileHandlesCommandLineMode { /// `InodeFileHandlesMode::Never` Never, /// Alias for `InodeFileHandlesMode::Prefer` Fallback, /// `InodeFileHandlesMode::Prefer` Prefer, /// `InodeFileHandlesMode::Mandatory` Mandatory, } impl From for InodeFileHandlesMode { fn from(clm: InodeFileHandlesCommandLineMode) -> Self { match clm { InodeFileHandlesCommandLineMode::Never => InodeFileHandlesMode::Never, InodeFileHandlesCommandLineMode::Fallback => InodeFileHandlesMode::Prefer, InodeFileHandlesCommandLineMode::Prefer => InodeFileHandlesMode::Prefer, InodeFileHandlesCommandLineMode::Mandatory => InodeFileHandlesMode::Mandatory, } } } impl FromStr for InodeFileHandlesCommandLineMode { type Err = &'static str; fn from_str(s: &str) -> std::result::Result { match s { "never" => Ok(InodeFileHandlesCommandLineMode::Never), "fallback" => Ok(InodeFileHandlesCommandLineMode::Fallback), "prefer" => Ok(InodeFileHandlesCommandLineMode::Prefer), "mandatory" => Ok(InodeFileHandlesCommandLineMode::Mandatory), _ => Err("invalid inode file handles mode"), } } } fn parse_tag(tag: &str) -> Result { if !tag.is_empty() && tag.len() <= MAX_TAG_LEN { Ok(tag.into()) } else { Err(Error::InvalidTag) } } #[derive(Clone, Debug, Parser)] #[command( name = "virtiofsd", about = "Launch a virtiofsd backend.", version, args_override_self = true )] struct Opt { /// Shared directory path #[arg(long)] shared_dir: Option, /// The tag that the virtio device advertises /// /// Setting this option will enable advertising of /// VHOST_USER_PROTOCOL_F_CONFIG. However, the vhost-user frontend of your /// hypervisor may not negotiate this feature and (or) ignore this value. /// Notably, QEMU currently (as of 8.1) ignores the CONFIG feature. 
QEMU /// versions from 7.1 to 8.0 will crash while attempting to log a warning /// about not supporting the feature. #[arg(long, value_parser = parse_tag)] tag: Option, /// vhost-user socket path [deprecated] #[arg(long, required_unless_present_any = &["fd", "socket_path", "print_capabilities"])] socket: Option, /// vhost-user socket path #[arg(long = "socket-path", required_unless_present_any = &["fd", "socket", "print_capabilities"])] socket_path: Option, /// Name of group for the vhost-user socket #[arg(long = "socket-group", conflicts_with_all = &["fd", "print_capabilities"])] socket_group: Option, /// File descriptor for the listening socket #[arg(long, required_unless_present_any = &["socket", "socket_path", "print_capabilities"], conflicts_with_all = &["socket_path", "socket"])] fd: Option, /// Maximum thread pool size. A value of "0" disables the pool #[arg(long, default_value = "0")] thread_pool_size: usize, /// Enable support for extended attributes #[arg(long)] xattr: bool, /// Enable support for posix ACLs (implies --xattr) #[arg(long)] posix_acl: bool, /// Add custom rules for translating extended attributes between host and guest /// (e.g. 
:map::user.virtiofs.:) #[arg(long, value_parser = |s: &_| XattrMap::try_from(s))] xattrmap: Option, /// Sandbox mechanism to isolate the daemon process (namespace, chroot, none) #[arg(long, default_value = "namespace")] sandbox: SandboxMode, /// Action to take when seccomp finds a not allowed syscall (none, kill, log, trap) #[arg(long, value_parser = parse_seccomp, default_value = "kill")] seccomp: SeccompAction, /// Tell the guest which directories are mount points [default] #[arg(long)] announce_submounts: bool, /// Do not tell the guest which directories are mount points #[arg(long, overrides_with("announce_submounts"))] no_announce_submounts: bool, /// When to use file handles to reference inodes instead of O_PATH file descriptors (never, /// prefer, mandatory) /// /// - never: Never use file handles, always use O_PATH file descriptors. /// /// - prefer: Attempt to generate file handles, but fall back to O_PATH file descriptors where /// the underlying filesystem does not support file handles. Useful when there are various /// different filesystems under the shared directory and some of them do not support file /// handles. ("fallback" is a deprecated alias for "prefer".) /// /// - mandatory: Always use file handles, never fall back to O_PATH file descriptors. /// /// Using file handles reduces the number of file descriptors virtiofsd keeps open, which is /// not only helpful with resources, but may also be important in cases where virtiofsd should /// only have file descriptors open for files that are open in the guest, e.g. to get around /// bad interactions with NFS's silly renaming. 
#[arg(long, require_equals = true, default_value = "never")] inode_file_handles: InodeFileHandlesCommandLineMode, /// The caching policy the file system should use (auto, always, never, metadata) #[arg(long, default_value = "auto")] cache: CachePolicy, /// Disable support for READDIRPLUS operations #[arg(long)] no_readdirplus: bool, /// Enable writeback cache #[arg(long)] writeback: bool, /// Honor the O_DIRECT flag passed down by guest applications #[arg(long)] allow_direct_io: bool, /// Print vhost-user.json backend program capabilities and exit #[arg(long = "print-capabilities")] print_capabilities: bool, /// Modify the list of capabilities, e.g., --modcaps=+sys_admin:-chown #[arg(long)] modcaps: Option, /// Log level (error, warn, info, debug, trace, off) #[arg(long = "log-level", default_value = "info")] log_level: LevelFilter, /// Log to syslog [default: stderr] #[arg(long)] syslog: bool, /// Set maximum number of file descriptors (0 leaves rlimit unchanged) /// [default: min(1000000, '/proc/sys/fs/nr_open')] #[arg(long = "rlimit-nofile")] rlimit_nofile: Option, /// Options in a format compatible with the legacy implementation [deprecated] #[arg(short = 'o')] compat_options: Option>, /// Set log level to "debug" [deprecated] #[arg(short = 'd')] compat_debug: bool, /// Disable KILLPRIV V2 support [default] #[arg(long)] _no_killpriv_v2: bool, /// Enable KILLPRIV V2 support #[arg(long, overrides_with("_no_killpriv_v2"))] killpriv_v2: bool, /// Compatibility option that has no effect [deprecated] #[arg(short = 'f')] compat_foreground: bool, /// Enable security label support. Expects SELinux xattr on file creation /// from client and stores it in the newly created file. #[arg(long = "security-label")] security_label: bool, /// Map a range of UIDs from the host into the namespace, given as /// :namespace_uid:host_uid:count: /// /// For example, :0:100000:65536: will map the 65536 host UIDs [100000, 165535] /// into the namespace as [0, 65535]. 
#[arg(long)] uid_map: Option, /// Map a range of GIDs from the host into the namespace, given as /// :namespace_gid:host_gid:count: /// /// For example, :0:100000:65536: will map the 65536 host GIDs [100000, 165535] /// into the namespace as [0, 65535]. #[arg(long)] gid_map: Option, /// Preserve O_NOATIME behavior, otherwise automatically clean up O_NOATIME flag to prevent /// potential permission errors when running in unprivileged mode (e.g., when accessing files /// without having ownership/capability to use O_NOATIME). #[arg(long = "preserve-noatime")] preserve_noatime: bool, } fn parse_compat(opt: Opt) -> Opt { use clap::error::ErrorKind; fn value_error(arg: &str, value: &str) -> ! { ::command() .error( ErrorKind::InvalidValue, format!("Invalid compat value '{value}' for '-o {arg}'"), ) .exit() } fn argument_error(arg: &str) -> ! { ::command() .error( ErrorKind::UnknownArgument, format!("Invalid compat argument '-o {arg}'"), ) .exit() } fn parse_tuple(opt: &mut Opt, tuple: &str) { match tuple.split('=').collect::>()[..] 
{ ["xattrmap", value] => { opt.xattrmap = Some( XattrMap::try_from(value).unwrap_or_else(|_| value_error("xattrmap", value)), ) } ["cache", value] => match value { "auto" => opt.cache = CachePolicy::Auto, "always" => opt.cache = CachePolicy::Always, "none" => opt.cache = CachePolicy::Never, "metadata" => opt.cache = CachePolicy::Metadata, _ => value_error("cache", value), }, ["loglevel", value] => match value { "debug" => opt.log_level = LevelFilter::Debug, "info" => opt.log_level = LevelFilter::Info, "warn" => opt.log_level = LevelFilter::Warn, "err" => opt.log_level = LevelFilter::Error, _ => value_error("loglevel", value), }, ["sandbox", value] => match value { "namespace" => opt.sandbox = SandboxMode::Namespace, "chroot" => opt.sandbox = SandboxMode::Chroot, _ => value_error("sandbox", value), }, ["source", value] => opt.shared_dir = Some(value.to_string()), ["modcaps", value] => opt.modcaps = Some(value.to_string()), _ => argument_error(tuple), } } fn parse_single(opt: &mut Opt, option: &str) { match option { "xattr" => opt.xattr = true, "no_xattr" => opt.xattr = false, "readdirplus" => opt.no_readdirplus = false, "no_readdirplus" => opt.no_readdirplus = true, "writeback" => opt.writeback = true, "no_writeback" => opt.writeback = false, "allow_direct_io" => opt.allow_direct_io = true, "no_allow_direct_io" => opt.allow_direct_io = false, "announce_submounts" => opt.announce_submounts = true, "killpriv_v2" => opt.killpriv_v2 = true, "no_killpriv_v2" => opt.killpriv_v2 = false, "posix_acl" => opt.posix_acl = true, "no_posix_acl" => opt.posix_acl = false, "security_label" => opt.security_label = true, "no_security_label" => opt.security_label = false, "no_posix_lock" | "no_flock" => (), _ => argument_error(option), } } let mut clean_opt = opt.clone(); if let Some(compat_options) = opt.compat_options.as_ref() { for line in compat_options { for option in line.to_string().split(',') { if option.contains('=') { parse_tuple(&mut clean_opt, option); } else { 
parse_single(&mut clean_opt, option); } } } } clean_opt } fn print_capabilities() { println!("{{"); println!(" \"type\": \"fs\""); println!("}}"); } fn set_default_logger(log_level: LevelFilter) { if env::var("RUST_LOG").is_err() { env::set_var("RUST_LOG", log_level.to_string()); } env_logger::init(); } fn initialize_logging(opt: &Opt) { let log_level = if opt.compat_debug { LevelFilter::Debug } else { opt.log_level }; if opt.syslog { if let Err(e) = syslog::init(syslog::Facility::LOG_USER, log_level, None) { set_default_logger(log_level); warn!("can't enable syslog: {}", e); } } else { set_default_logger(log_level); } } fn set_signal_handlers() { use vmm_sys_util::signal; extern "C" fn handle_signal(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) { unsafe { libc::_exit(1) }; } let signals = vec![libc::SIGHUP, libc::SIGTERM]; for s in signals { if let Err(e) = signal::register_signal_handler(s, handle_signal) { error!("Setting signal handlers: {}", e); process::exit(1); } } } fn parse_modcaps( default_caps: Vec<&str>, modcaps: Option, ) -> (HashSet, HashSet) { let mut required_caps: HashSet = default_caps.iter().map(|&s| s.into()).collect(); let mut disabled_caps = HashSet::new(); if let Some(modcaps) = modcaps { for modcap in modcaps.split(':').map(str::to_string) { if modcap.is_empty() { error!("empty modcap found: expected (+|-)capability:..."); process::exit(1); } let (action, cap_name) = modcap.split_at(1); let cap_name = cap_name.to_uppercase(); if !matches!(action, "+" | "-") { error!( "invalid modcap action: expecting '+'|'-' but found '{}'", action ); process::exit(1); } if let Err(error) = capng::name_to_capability(&cap_name) { error!("invalid capability '{}': {}", &cap_name, error); process::exit(1); } match action { "+" => { disabled_caps.remove(&cap_name); required_caps.insert(cap_name); } "-" => { required_caps.remove(&cap_name); disabled_caps.insert(cap_name); } _ => unreachable!(), } } } (required_caps, disabled_caps) } fn 
drop_capabilities(inode_file_handles: InodeFileHandlesMode, modcaps: Option) { let default_caps = vec![ "CHOWN", "DAC_OVERRIDE", "FOWNER", "FSETID", "SETGID", "SETUID", "MKNOD", "SETFCAP", ]; let (mut required_caps, disabled_caps) = parse_modcaps(default_caps, modcaps); if inode_file_handles != InodeFileHandlesMode::Never { let required_cap = "DAC_READ_SEARCH".to_owned(); if disabled_caps.contains(&required_cap) { error!( "can't disable {} when using --inode-file-handles={:?}", &required_cap, inode_file_handles ); process::exit(1); } required_caps.insert(required_cap); } capng::clear(capng::Set::BOTH); // Configure the required set of capabilities for the child, and leave the // parent with none. if let Err(e) = capng::updatev( capng::Action::ADD, capng::Type::PERMITTED | capng::Type::EFFECTIVE, required_caps.iter().map(String::as_str).collect(), ) { error!("can't set up the child capabilities: {}", e); process::exit(1); } if let Err(e) = capng::apply(capng::Set::BOTH) { error!("can't apply the child capabilities: {}", e); process::exit(1); } } fn has_noatime_capability() -> bool { // We may not have all permissions/capabilities to use O_NOATIME with all the exported files if // we are running as unprivileged user and without any sandbox (e.g., --sandbox=none). // // Provide this helper function to check this particular case. let uid = unsafe { libc::geteuid() }; let cap = capng::name_to_capability("FOWNER").unwrap_or_else(|err| { error!("could not get capability FOWNER: {}", err); process::exit(1); }); uid == 0 || capng::have_capability(capng::Type::EFFECTIVE, cap) } fn main() { let opt = parse_compat(Opt::parse()); // Enable killpriv_v2 only if user explicitly asked for it by using // --killpriv-v2 or -o killpriv_v2. Otherwise disable it by default. 
let killpriv_v2 = opt.killpriv_v2; // Disable announce submounts if the user asked for it let announce_submounts = !opt.no_announce_submounts; if opt.print_capabilities { print_capabilities(); return; } initialize_logging(&opt); set_signal_handlers(); let shared_dir = match opt.shared_dir.as_ref() { Some(s) => s, None => { error!("missing \"--shared-dir\" or \"-o source\" option"); process::exit(1); } }; if opt.compat_foreground { warn!("Use of deprecated flag '-f': This flag has no effect, please remove it"); } if opt.compat_debug { warn!("Use of deprecated flag '-d': Please use the '--log-level debug' option instead"); } if opt.compat_options.is_some() { warn!("Use of deprecated option format '-o': Please specify options without it (e.g., '--cache auto' instead of '-o cache=auto')"); } if opt.inode_file_handles == InodeFileHandlesCommandLineMode::Fallback { warn!("Use of deprecated value 'fallback' for '--inode-file-handles': Please use 'prefer' instead"); } let xattrmap = opt.xattrmap.clone(); let xattr = xattrmap.is_some() || opt.posix_acl || opt.xattr; let thread_pool_size = opt.thread_pool_size; let readdirplus = match opt.cache { CachePolicy::Never => false, _ => !opt.no_readdirplus, }; let timeout = match opt.cache { CachePolicy::Never => Duration::from_secs(0), CachePolicy::Metadata => Duration::from_secs(86400), CachePolicy::Auto => Duration::from_secs(1), CachePolicy::Always => Duration::from_secs(86400), }; let umask = if opt.socket_group.is_some() { libc::S_IROTH | libc::S_IWOTH | libc::S_IXOTH } else { libc::S_IRGRP | libc::S_IWGRP | libc::S_IXGRP | libc::S_IROTH | libc::S_IWOTH | libc::S_IXOTH }; // We need to keep _pid_file around because it maintains a lock on the pid file // that prevents another daemon from using the same pid file. 
let (listener, socket_path, _pid_file) = match opt.fd.as_ref() { Some(fd) => unsafe { (Listener::from_raw_fd(*fd), None, None) }, None => { // Set umask to ensure the socket is created with the right permissions let _umask_guard = oslib::ScopedUmask::new(umask); let socket = opt.socket_path.as_ref().unwrap_or_else(|| { warn!("use of deprecated parameter '--socket': Please use the '--socket-path' option instead"); opt.socket.as_ref().unwrap() // safe to unwrap because clap ensures either --socket or --socket-path are passed }); let pid_file_name = socket.to_owned() + ".pid"; let pid_file_path = Path::new(pid_file_name.as_str()); let pid_file = write_pid_file(pid_file_path).unwrap_or_else(|error| { error!("Error creating pid file '{}': {}", pid_file_name, error); process::exit(1); }); let listener = Listener::new(socket, true).unwrap_or_else(|error| { error!("Error creating listener: {}", error); process::exit(1); }); (listener, Some(socket.clone()), Some(pid_file)) } }; if let Some(group_name) = opt.socket_group { let c_name = CString::new(group_name).expect("invalid group name"); let group = unsafe { libc::getgrnam(c_name.as_ptr()) }; if group.is_null() { error!("Couldn't resolve the group name specified for the socket path"); process::exit(1); } // safe to unwrap because clap ensures --socket-group can't be specified alongside --fd let c_socket_path = CString::new(socket_path.unwrap()).expect("invalid socket path"); let ret = unsafe { libc::chown(c_socket_path.as_ptr(), u32::MAX, (*group).gr_gid) }; if ret != 0 { error!( "Couldn't set up the group for the socket path: {}", std::io::Error::last_os_error() ); process::exit(1); } } limits::setup_rlimit_nofile(opt.rlimit_nofile).unwrap_or_else(|error| { error!("Error increasing number of open files: {}", error); process::exit(1) }); let mut sandbox = Sandbox::new( shared_dir.to_string(), opt.sandbox, opt.uid_map, opt.gid_map, ) .unwrap_or_else(|error| { error!("Error creating sandbox: {}", error); process::exit(1) }); 
// Enter the sandbox, from this point the process will be isolated (or not) // as chosen in '--sandbox'. sandbox.enter().unwrap_or_else(|error| { error!("Error entering sandbox: {}", error); process::exit(1) }); let fs_cfg = passthrough::Config { entry_timeout: timeout, attr_timeout: timeout, cache_policy: opt.cache, root_dir: sandbox.get_root_dir(), mountinfo_prefix: sandbox.get_mountinfo_prefix(), xattr, xattrmap, proc_sfd_rawfd: sandbox.get_proc_self_fd(), proc_mountinfo_rawfd: sandbox.get_mountinfo_fd(), announce_submounts, inode_file_handles: opt.inode_file_handles.into(), readdirplus, writeback: opt.writeback, allow_direct_io: opt.allow_direct_io, killpriv_v2, security_label: opt.security_label, posix_acl: opt.posix_acl, clean_noatime: !opt.preserve_noatime && !has_noatime_capability(), ..Default::default() }; // Must happen before we start the thread pool match opt.seccomp { SeccompAction::Allow => {} _ => enable_seccomp(opt.seccomp, opt.syslog).unwrap(), } // We don't modify the capabilities if the user call us without // any sandbox (i.e. 
--sandbox=none) as unprivileged user let uid = unsafe { libc::geteuid() }; if uid == 0 { drop_capabilities(fs_cfg.inode_file_handles, opt.modcaps); } let fs = match PassthroughFs::new(fs_cfg) { Ok(fs) => fs, Err(e) => { error!( "Failed to create internal filesystem representation: {:?}", e ); process::exit(1); } }; let fs_backend = Arc::new( VhostUserFsBackend::new(fs, thread_pool_size, opt.tag).unwrap_or_else(|error| { error!("Error creating vhost-user backend: {}", error); process::exit(1) }), ); let mut daemon = VhostUserDaemon::new( String::from("virtiofsd-backend"), fs_backend.clone(), GuestMemoryAtomic::new(GuestMemoryMmap::new()), ) .unwrap(); info!("Waiting for vhost-user socket connection..."); if let Err(e) = daemon.start(listener) { error!("Failed to start daemon: {:?}", e); process::exit(1); } info!("Client connected, servicing requests"); if let Err(e) = daemon.wait() { match e { HandleRequest(Disconnected) => info!("Client disconnected, shutting down"), _ => error!("Waiting for daemon failed: {:?}", e), } } let kill_evt = fs_backend .thread .read() .unwrap() .kill_evt .try_clone() .unwrap(); if let Err(e) = kill_evt.write(1) { error!("Error shutting down worker thread: {:?}", e) } } virtiofsd-1.10.0/src/oslib.rs000064400000000000000000000363651046102023000141760ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause use bitflags::bitflags; use std::ffi::{CStr, CString}; use std::fs::File; use std::io::{self, Error, Result}; use std::os::unix::io::{AsRawFd, BorrowedFd, RawFd}; use std::os::unix::prelude::FromRawFd; // A helper function that check the return value of a C function call // and wraps it in a `Result` type, returning the `errno` code as `Err`. fn check_retval + PartialEq>(t: T) -> Result { if t == T::from(-1_i8) { Err(Error::last_os_error()) } else { Ok(t) } } /// Simple object to collect basic facts about the OS, /// such as available syscalls. 
pub struct OsFacts { pub has_openat2: bool, } #[allow(clippy::new_without_default)] impl OsFacts { /// This object should only be constructed using new. #[must_use] pub fn new() -> Self { // Checking for `openat2()` since it first appeared in Linux 5.6. // SAFETY: all-zero byte-pattern is a valid `libc::open_how` let how: libc::open_how = unsafe { std::mem::zeroed() }; let cwd = CString::new(".").unwrap(); // SAFETY: `cwd.as_ptr()` points to a valid NUL-terminated string, // and the `how` pointer is a valid pointer to an `open_how` struct. let fd = unsafe { libc::syscall( libc::SYS_openat2, libc::AT_FDCWD, cwd.as_ptr(), std::ptr::addr_of!(how), std::mem::size_of::(), ) }; let has_openat2 = fd >= 0; if has_openat2 { // SAFETY: `fd` is an open file descriptor unsafe { libc::close(fd as libc::c_int); } } Self { has_openat2 } } } /// Safe wrapper for `mount(2)` /// /// # Errors /// /// Will return `Err(errno)` if `mount(2)` fails. /// Each filesystem type may have its own special errors and its own special behavior, /// see `mount(2)` and the linux source kernel for details. /// /// # Panics /// /// This function panics if the strings `source`, `target` or `fstype` contain an internal 0 byte. pub fn mount(source: Option<&str>, target: &str, fstype: Option<&str>, flags: u64) -> Result<()> { let source = CString::new(source.unwrap_or("")).unwrap(); let source = source.as_ptr(); let target = CString::new(target).unwrap(); let target = target.as_ptr(); let fstype = CString::new(fstype.unwrap_or("")).unwrap(); let fstype = fstype.as_ptr(); // Safety: `source`, `target` or `fstype` are a valid C string pointers check_retval(unsafe { libc::mount(source, target, fstype, flags, std::ptr::null()) })?; Ok(()) } /// Safe wrapper for `umount2(2)` /// /// # Errors /// /// Will return `Err(errno)` if `umount2(2)` fails. /// Each filesystem type may have its own special errors and its own special behavior, /// see `umount2(2)` and the linux source kernel for details. 
/// /// # Panics /// /// This function panics if the strings `target` contains an internal 0 byte. pub fn umount2(target: &str, flags: i32) -> Result<()> { let target = CString::new(target).unwrap(); let target = target.as_ptr(); // Safety: `target` is a valid C string pointer check_retval(unsafe { libc::umount2(target, flags) })?; Ok(()) } /// Safe wrapper for `fchdir(2)` /// /// # Errors /// /// Will return `Err(errno)` if `fchdir(2)` fails. /// Each filesystem type may have its own special errors, see `fchdir(2)` for details. pub fn fchdir(fd: RawFd) -> Result<()> { check_retval(unsafe { libc::fchdir(fd) })?; Ok(()) } /// Safe wrapper for `umask(2)` pub fn umask(mask: u32) -> u32 { // SAFETY: this call doesn't modify any memory and there is no need // to check the return value because this system call always succeeds. unsafe { libc::umask(mask) } } /// An RAII implementation of a scoped file mode creation mask (umask), it set the /// new umask. When this structure is dropped (falls out of scope), it set the previous /// value of the mask. pub struct ScopedUmask { umask: libc::mode_t, } impl ScopedUmask { pub fn new(new_umask: u32) -> Self { Self { umask: umask(new_umask), } } } impl Drop for ScopedUmask { fn drop(&mut self) { umask(self.umask); } } /// Safe wrapper around `openat(2)`. /// /// # Errors /// /// Will return `Err(errno)` if `openat(2)` fails, /// see `openat(2)` for details. pub fn openat(dir: &impl AsRawFd, pathname: &CStr, flags: i32, mode: Option) -> Result { let mode = u64::from(mode.unwrap_or(0)); // SAFETY: `pathname` points to a valid NUL-terminated string. // However, the caller must ensure that `dir` can provide a valid file descriptor. check_retval(unsafe { libc::openat( dir.as_raw_fd(), pathname.as_ptr(), flags as libc::c_int, mode, ) }) } /// An utility function that uses `openat2(2)` to restrict the how the provided pathname /// is resolved. 
It uses the following flags: /// - `RESOLVE_IN_ROOT`: Treat the directory referred to by dirfd as the root directory while /// resolving pathname. This has the effect as though virtiofsd had used chroot(2) to modify its /// root directory to dirfd. /// - `RESOLVE_NO_MAGICLINKS`: Disallow all magic-link (i.e., proc(2) link-like files) resolution /// during path resolution. /// /// Additionally, the flags `O_NOFOLLOW` and `O_CLOEXEC` are added. /// /// # Error /// /// Will return `Err(errno)` if `openat2(2)` fails, see the man page for details. /// /// # Safety /// /// The caller must ensure that dirfd is a valid file descriptor. pub fn do_open_relative_to( dir: &impl AsRawFd, pathname: &CStr, flags: i32, mode: Option, ) -> Result { // `openat2(2)` returns an error if `how.mode` contains bits other than those in range 07777, // let's ignore the extra bits to be compatible with `openat(2)`. let mode = u64::from(mode.unwrap_or(0)) & 0o7777; // SAFETY: all-zero byte-pattern represents a valid `libc::open_how` let mut how: libc::open_how = unsafe { std::mem::zeroed() }; how.resolve = libc::RESOLVE_IN_ROOT | libc::RESOLVE_NO_MAGICLINKS; how.flags = flags as u64; how.mode = mode; // SAFETY: `pathname` points to a valid NUL-terminated string, and the `how` pointer is a valid // pointer to an `open_how` struct. However, the caller must ensure that `dir` can provide a // valid file descriptor (this can be changed to BorrowedFd). 
check_retval(unsafe { libc::syscall( libc::SYS_openat2, dir.as_raw_fd(), pathname.as_ptr(), std::ptr::addr_of!(how), std::mem::size_of::(), ) } as RawFd) } mod filehandle { const MAX_HANDLE_SZ: usize = 128; #[derive(Clone, PartialOrd, Ord, PartialEq, Eq)] #[repr(C)] pub struct CFileHandle { handle_bytes: libc::c_uint, handle_type: libc::c_int, f_handle: [libc::c_char; MAX_HANDLE_SZ], } impl Default for CFileHandle { fn default() -> Self { CFileHandle { handle_bytes: MAX_HANDLE_SZ as libc::c_uint, handle_type: 0, f_handle: [0; MAX_HANDLE_SZ], } } } extern "C" { pub fn name_to_handle_at( dirfd: libc::c_int, pathname: *const libc::c_char, file_handle: *mut CFileHandle, mount_id: *mut libc::c_int, flags: libc::c_int, ) -> libc::c_int; // Technically `file_handle` should be a `mut` pointer, but `open_by_handle_at()` is specified // not to change it, so we can declare it `const`. pub fn open_by_handle_at( mount_fd: libc::c_int, file_handle: *const CFileHandle, flags: libc::c_int, ) -> libc::c_int; } } pub use filehandle::CFileHandle; pub fn name_to_handle_at( dirfd: &impl AsRawFd, pathname: &CStr, file_handle: &mut CFileHandle, mount_id: &mut libc::c_int, flags: libc::c_int, ) -> Result<()> { // SAFETY: `dirfd` is a valid file descriptor, `file_handle` // is a valid reference to `CFileHandle`, and `mount_id` is // valid reference to an `int` check_retval(unsafe { filehandle::name_to_handle_at( dirfd.as_raw_fd(), pathname.as_ptr(), file_handle, mount_id, flags, ) })?; Ok(()) } pub fn open_by_handle_at( mount_fd: &impl AsRawFd, file_handle: &CFileHandle, flags: libc::c_int, ) -> Result { // SAFETY: `mount_fd` is a valid file descriptor and `file_handle` // is a valid reference to `CFileHandle` let fd = check_retval(unsafe { filehandle::open_by_handle_at(mount_fd.as_raw_fd(), file_handle, flags) })?; // SAFETY: `open_by_handle_at()` guarantees `fd` is a valid file descriptor Ok(unsafe { File::from_raw_fd(fd) }) } mod writev { /// musl does not provide a wrapper for the 
`pwritev2(2)` system call, /// we need to call it using `syscall(2)`. #[cfg(target_env = "gnu")] pub use libc::pwritev2; #[cfg(target_env = "musl")] pub unsafe fn pwritev2( fd: libc::c_int, iov: *const libc::iovec, iovcnt: libc::c_int, offset: libc::off_t, flags: libc::c_int, ) -> libc::ssize_t { // The `pwritev2(2)` syscall expects to receive the 64-bit offset split in // its high and low parts (see `syscall(2)`). On 64-bit architectures we // set `lo_off=offset` and `hi_off=0` (glibc does it), since `hi_off` is cleared, // so we need to make sure of not clear the higher 32 bits of `lo_off`, otherwise // the offset will be 0 on 64-bit architectures. let lo_off = offset as libc::c_long; // warn: do not clear the higher 32 bits let hi_off = (offset as u64).checked_shr(libc::c_long::BITS).unwrap_or(0) as libc::c_long; unsafe { libc::syscall(libc::SYS_pwritev2, fd, iov, iovcnt, lo_off, hi_off, flags) as libc::ssize_t } } } // We cannot use libc::RWF_HIPRI, etc, because these constants are not defined in musl. bitflags! { /// A bitwise OR of zero or more flags passed in as a parameter to the /// write vectored function `writev_at()`. pub struct WritevFlags: i32 { /// High priority write. Allows block-based filesystems to use polling of the device, which /// provides lower latency, but may use additional resources. (Currently, this feature is /// usable only on a file descriptor opened using the O_DIRECT flag.) const RWF_HIPRI = 0x00000001; /// Provide a per-write equivalent of the O_DSYNC open(2) flag. Its effect applies /// only to the data range written by the system call. const RWF_DSYNC = 0x00000002; /// Provide a per-write equivalent of the O_SYNC open(2) flag. Its effect applies only /// to the data range written by the system call. const RWF_SYNC = 0x00000004; /// Provide a per-write equivalent of the O_APPEND open(2) flag. Its effect applies only /// to the data range written by the system call. 
The offset argument does not affect the /// write operation; the data is always appended to the end of the file. /// However, if the offset argument is -1, the current file offset is updated. const RWF_APPEND = 0x00000010; } } #[cfg(target_env = "gnu")] mod writev_test { // Lets make sure (at compile time) that the WritevFlags don't go out of sync with the libc const _: () = assert!( super::WritevFlags::RWF_HIPRI.bits() == libc::RWF_HIPRI, "invalid RWF_HIPRI value" ); const _: () = assert!( super::WritevFlags::RWF_DSYNC.bits() == libc::RWF_DSYNC, "invalid RWF_DSYNC value" ); const _: () = assert!( super::WritevFlags::RWF_SYNC.bits() == libc::RWF_SYNC, "invalid RWF_SYNC value" ); const _: () = assert!( super::WritevFlags::RWF_APPEND.bits() == libc::RWF_APPEND, "invalid RWF_APPEND value" ); } /// Safe wrapper for `pwritev2(2)` /// /// This system call is similar `pwritev(2)`, but add a new argument, /// flags, which modifies the behavior on a per-call basis. /// Unlike `pwritev(2)`, if the offset argument is -1, then the current file offset /// is used and updated. /// /// # Errors /// /// Will return `Err(errno)` if `pwritev2(2)` fails, see `pwritev2(2)` for details. /// /// # Safety /// /// The caller must ensure that each iovec element is valid (i.e., it has a valid `iov_base` /// pointer and `iov_len`). pub unsafe fn writev_at( fd: BorrowedFd, iovecs: &[libc::iovec], offset: i64, flags: Option, ) -> Result { let flags = flags.unwrap_or(WritevFlags::empty()); // SAFETY: `fd` is a valid filed descriptor, `iov` is a valid pointer // to the iovec slice `ìovecs` of `iovcnt` elements. However, the caller // must ensure that each iovec element has a valid `iov_base` pointer and `iov_len`. 
let bytes_written = check_retval(unsafe { writev::pwritev2( fd.as_raw_fd(), iovecs.as_ptr(), iovecs.len() as libc::c_int, offset, flags.bits(), ) })?; Ok(bytes_written as usize) } pub struct PipeReader(File); impl io::Read for PipeReader { fn read(&mut self, buf: &mut [u8]) -> io::Result { self.0.read(buf) } } pub struct PipeWriter(File); impl io::Write for PipeWriter { fn write(&mut self, buf: &[u8]) -> io::Result { self.0.write(buf) } fn flush(&mut self) -> io::Result<()> { self.0.flush() } } pub fn pipe() -> io::Result<(PipeReader, PipeWriter)> { let mut fds: [RawFd; 2] = [-1, -1]; let ret = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) }; if ret == -1 { Err(io::Error::last_os_error()) } else { Ok(( PipeReader(unsafe { File::from_raw_fd(fds[0]) }), PipeWriter(unsafe { File::from_raw_fd(fds[1]) }), )) } } // We want credential changes to be per-thread because otherwise // we might interfere with operations being carried out on other // threads with different uids/gids. However, posix requires that // all threads in a process share the same credentials. To do this // libc uses signals to ensure that when one thread changes its // credentials the other threads do the same thing. // // So instead we invoke the syscall directly in order to get around // this limitation. Another option is to use the setfsuid and // setfsgid systems calls. However since those calls have no way to // return an error, it's preferable to do this instead. 
/// Set effective user ID pub fn seteffuid(uid: libc::uid_t) -> io::Result<()> { check_retval(unsafe { libc::syscall(libc::SYS_setresuid, -1, uid, -1) })?; Ok(()) } /// Set effective group ID pub fn seteffgid(gid: libc::gid_t) -> io::Result<()> { check_retval(unsafe { libc::syscall(libc::SYS_setresgid, -1, gid, -1) })?; Ok(()) } /// Set supplementary group pub fn setsupgroup(gid: libc::gid_t) -> io::Result<()> { check_retval(unsafe { libc::setgroups(1, &gid) })?; Ok(()) } /// Drop all supplementary groups pub fn dropsupgroups() -> io::Result<()> { check_retval(unsafe { libc::setgroups(0, std::ptr::null()) })?; Ok(()) } virtiofsd-1.10.0/src/passthrough/credentials.rs000064400000000000000000000123011046102023000177120ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause use crate::oslib; use crate::passthrough::util::einval; use std::io; pub struct UnixCredentials { uid: libc::uid_t, gid: libc::gid_t, sup_gid: Option, keep_capability: bool, } impl UnixCredentials { pub fn new(uid: libc::uid_t, gid: libc::gid_t) -> Self { UnixCredentials { uid, gid, sup_gid: None, keep_capability: false, } } /// Set a supplementary group. Set `supported_extension` to `false` to signal that a /// supplementary group maybe required, but the guest was not able to tell us which, /// so we have to rely on keeping the DAC_OVERRIDE capability. pub fn supplementary_gid(self, supported_extension: bool, sup_gid: Option) -> Self { UnixCredentials { uid: self.uid, gid: self.gid, sup_gid, keep_capability: !supported_extension, } } /// Changes the effective uid/gid of the current thread to `val`. Changes /// the thread's credentials back to root when the returned struct is dropped. pub fn set(self) -> io::Result> { let change_uid = self.uid != 0; let change_gid = self.gid != 0; // We have to change the gid before we change the uid because if we // change the uid first then we lose the capability to change the gid. // However changing back can happen in any order. 
if let Some(sup_gid) = self.sup_gid { oslib::setsupgroup(sup_gid)?; } if change_gid { oslib::seteffgid(self.gid)?; } if change_uid { oslib::seteffuid(self.uid)?; } if change_uid && self.keep_capability { // Before kernel 6.3, we don't have access to process supplementary groups. // To work around this we can set the `DAC_OVERRIDE` in the effective set. // We are allowed to set the capability because we only change the effective // user ID, so we still have the 'DAC_OVERRIDE' in the permitted set. // After switching back to root the permitted set is copied to the effective set, // so no additional steps are required. if let Err(e) = crate::util::add_cap_to_eff("DAC_OVERRIDE") { warn!("failed to add 'DAC_OVERRIDE' to the effective set of capabilities: {e}"); } } if !change_uid && !change_gid { return Ok(None); } Ok(Some(UnixCredentialsGuard { reset_uid: change_uid, reset_gid: change_gid, drop_sup_gid: self.sup_gid.is_some(), })) } } pub struct UnixCredentialsGuard { reset_uid: bool, reset_gid: bool, drop_sup_gid: bool, } impl Drop for UnixCredentialsGuard { fn drop(&mut self) { if self.reset_uid { oslib::seteffuid(0).unwrap_or_else(|e| { error!("failed to change uid back to root: {e}"); }); } if self.reset_gid { oslib::seteffgid(0).unwrap_or_else(|e| { error!("failed to change gid back to root: {e}"); }); } if self.drop_sup_gid { oslib::dropsupgroups().unwrap_or_else(|e| { error!("failed to drop supplementary groups: {e}"); }); } } } pub struct ScopedCaps { cap: capng::Capability, } impl ScopedCaps { fn new(cap_name: &str) -> io::Result> { use capng::{Action, CUpdate, Set, Type}; let cap = capng::name_to_capability(cap_name).map_err(|_| { let err = io::Error::last_os_error(); error!( "couldn't get the capability id for name {}: {:?}", cap_name, err ); err })?; if capng::have_capability(Type::EFFECTIVE, cap) { let req = vec![CUpdate { action: Action::DROP, cap_type: Type::EFFECTIVE, capability: cap, }]; capng::update(req).map_err(|e| { error!("couldn't drop {} 
capability: {:?}", cap, e); einval() })?; capng::apply(Set::CAPS).map_err(|e| { error!( "couldn't apply capabilities after dropping {}: {:?}", cap, e ); einval() })?; Ok(Some(Self { cap })) } else { Ok(None) } } } impl Drop for ScopedCaps { fn drop(&mut self) { use capng::{Action, CUpdate, Set, Type}; let req = vec![CUpdate { action: Action::ADD, cap_type: Type::EFFECTIVE, capability: self.cap, }]; if let Err(e) = capng::update(req) { panic!("couldn't restore {} capability: {:?}", self.cap, e); } if let Err(e) = capng::apply(Set::CAPS) { panic!( "couldn't apply capabilities after restoring {}: {:?}", self.cap, e ); } } } pub fn drop_effective_cap(cap_name: &str) -> io::Result> { ScopedCaps::new(cap_name) } virtiofsd-1.10.0/src/passthrough/file_handle.rs000064400000000000000000000066351046102023000176640ustar 00000000000000// Copyright 2021 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use crate::oslib; use crate::passthrough::mount_fd::{MPRResult, MountFd, MountFds}; use crate::passthrough::stat::MountId; use std::ffi::CStr; use std::fs::File; use std::io; use std::os::unix::io::{AsRawFd, RawFd}; use std::sync::Arc; const EMPTY_CSTR: &[u8] = b"\0"; #[derive(Clone, PartialOrd, Ord, PartialEq, Eq)] pub struct FileHandle { mnt_id: MountId, handle: oslib::CFileHandle, } pub struct OpenableFileHandle { handle: FileHandle, mount_fd: Arc, } pub enum FileOrHandle { File(File), Handle(OpenableFileHandle), } impl FileHandle { /// Create a file handle for the given file. /// /// Return `Ok(None)` if no file handle can be generated for this file: Either because the /// filesystem does not support it, or because it would require a larger file handle than we /// can store. These are not intermittent failures, i.e. if this function returns `Ok(None)` /// for a specific file, it will always return `Ok(None)` for it. 
Conversely, if this function /// returns `Ok(Some)` at some point, it will never return `Ok(None)` later. /// /// Return an `io::Error` for all other errors. pub fn from_name_at(dir: &impl AsRawFd, path: &CStr) -> io::Result> { let mut mount_id: libc::c_int = 0; let mut c_fh = oslib::CFileHandle::default(); if let Err(err) = oslib::name_to_handle_at(dir, path, &mut c_fh, &mut mount_id, libc::AT_EMPTY_PATH) { match err.raw_os_error() { // Filesystem does not support file handles Some(libc::EOPNOTSUPP) => Ok(None), // Handle would need more bytes than `MAX_HANDLE_SZ` Some(libc::EOVERFLOW) => Ok(None), // Other error _ => Err(err), } } else { Ok(Some(FileHandle { mnt_id: mount_id as MountId, handle: c_fh, })) } } /// Create a file handle for `fd`. /// This is a wrapper around `from_name_at()` and so has the same interface. pub fn from_fd(fd: &impl AsRawFd) -> io::Result> { // Safe because this is a constant value and a valid C string. let empty_path = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; Self::from_name_at(fd, empty_path) } /** * Return an openable copy of the file handle by ensuring that `mount_fds` contains a valid fd * for the mount the file handle is for. * * `reopen_fd` will be invoked to duplicate an `O_PATH` fd with custom `libc::open()` flags. */ pub fn to_openable( &self, mount_fds: &MountFds, reopen_fd: F, ) -> MPRResult where F: FnOnce(RawFd, libc::c_int) -> io::Result, { Ok(OpenableFileHandle { handle: self.clone(), mount_fd: mount_fds.get(self.mnt_id, reopen_fd)?, }) } } impl OpenableFileHandle { pub fn inner(&self) -> &FileHandle { &self.handle } /** * Open a file handle, using our mount FDs hash map. 
*/ pub fn open(&self, flags: libc::c_int) -> io::Result { oslib::open_by_handle_at(self.mount_fd.file(), &self.handle.handle, flags) } } virtiofsd-1.10.0/src/passthrough/inode_store.rs000064400000000000000000000111651046102023000177360ustar 00000000000000// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. use crate::passthrough::file_handle::{FileHandle, FileOrHandle}; use crate::passthrough::stat::MountId; use crate::passthrough::util::{ebadf, is_safe_inode, reopen_fd_through_proc}; use std::collections::BTreeMap; use std::fs::File; use std::io; use std::os::unix::io::{AsRawFd, RawFd}; use std::sync::atomic::AtomicU64; use std::sync::Arc; pub type Inode = u64; #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd)] pub struct InodeIds { pub ino: libc::ino64_t, pub dev: libc::dev_t, pub mnt_id: MountId, } pub struct InodeData { pub inode: Inode, // Most of these aren't actually files but ¯\_(ツ)_/¯. pub file_or_handle: FileOrHandle, pub refcount: AtomicU64, // Used as key in the `InodeStore::by_ids` map. pub ids: InodeIds, // File type and mode pub mode: u32, } /** * Represents the file associated with an inode (`InodeData`). * * When obtaining such a file, it may either be a new file (the `Owned` variant), in which case the * object's lifetime is static, or it may reference `InodeData.file` (the `Ref` variant), in which * case the object's lifetime is that of the respective `InodeData` object. 
*/ pub enum InodeFile<'inode_lifetime> { Owned(File), Ref(&'inode_lifetime File), } #[derive(Default)] pub struct InodeStore { data: BTreeMap>, by_ids: BTreeMap, by_handle: BTreeMap, } impl<'a> InodeData { /// Get an `O_PATH` file for this inode pub fn get_file(&'a self) -> io::Result> { match &self.file_or_handle { FileOrHandle::File(f) => Ok(InodeFile::Ref(f)), FileOrHandle::Handle(h) => { let file = h.open(libc::O_PATH)?; Ok(InodeFile::Owned(file)) } } } /// Open this inode with the given flags /// (always returns a new (i.e. `Owned`) file, hence the static lifetime) pub fn open_file( &self, flags: libc::c_int, proc_self_fd: &File, ) -> io::Result> { if !is_safe_inode(self.mode) { return Err(ebadf()); } match &self.file_or_handle { FileOrHandle::File(f) => { let new_file = reopen_fd_through_proc(f, flags, proc_self_fd)?; Ok(InodeFile::Owned(new_file)) } FileOrHandle::Handle(h) => { let new_file = h.open(flags)?; Ok(InodeFile::Owned(new_file)) } } } } impl InodeFile<'_> { /// Create a standalone `File` object pub fn into_file(self) -> io::Result { match self { Self::Owned(file) => Ok(file), Self::Ref(file_ref) => file_ref.try_clone(), } } } impl AsRawFd for InodeFile<'_> { /// Return a file descriptor for this file /// Note: This fd is only valid as long as the `InodeFile` exists. 
fn as_raw_fd(&self) -> RawFd { match self { Self::Owned(file) => file.as_raw_fd(), Self::Ref(file_ref) => file_ref.as_raw_fd(), } } } impl InodeStore { pub fn insert(&mut self, data: Arc) { self.by_ids.insert(data.ids, data.inode); if let FileOrHandle::Handle(handle) = &data.file_or_handle { self.by_handle.insert(handle.inner().clone(), data.inode); } self.data.insert(data.inode, data); } pub fn remove(&mut self, inode: &Inode) -> Option> { let data = self.data.remove(inode); if let Some(data) = data.as_ref() { if let FileOrHandle::Handle(handle) = &data.file_or_handle { self.by_handle.remove(handle.inner()); } self.by_ids.remove(&data.ids); } data } pub fn clear(&mut self) { self.data.clear(); self.by_handle.clear(); self.by_ids.clear(); } pub fn get(&self, inode: &Inode) -> Option<&Arc> { self.data.get(inode) } pub fn get_by_ids(&self, ids: &InodeIds) -> Option<&Arc> { self.inode_by_ids(ids).map(|inode| self.get(inode).unwrap()) } pub fn get_by_handle(&self, handle: &FileHandle) -> Option<&Arc> { self.inode_by_handle(handle) .map(|inode| self.get(inode).unwrap()) } pub fn inode_by_ids(&self, ids: &InodeIds) -> Option<&Inode> { self.by_ids.get(ids) } pub fn inode_by_handle(&self, handle: &FileHandle) -> Option<&Inode> { self.by_handle.get(handle) } } virtiofsd-1.10.0/src/passthrough/mod.rs000064400000000000000000002537351046102023000162160ustar 00000000000000// Copyright 2019 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
pub mod credentials; pub mod file_handle; pub mod inode_store; pub mod mount_fd; pub mod stat; pub mod util; pub mod xattrmap; use super::fs_cache_req_handler::FsCacheReqHandler; use crate::filesystem::{ Context, Entry, Extensions, FileSystem, FsOptions, GetxattrReply, ListxattrReply, OpenOptions, SecContext, SetattrValid, SetxattrFlags, ZeroCopyReader, ZeroCopyWriter, }; use crate::passthrough::credentials::{drop_effective_cap, UnixCredentials}; use crate::passthrough::inode_store::{Inode, InodeData, InodeFile, InodeIds, InodeStore}; use crate::passthrough::util::{ebadf, is_safe_inode, openat, reopen_fd_through_proc}; use crate::read_dir::ReadDir; use crate::{fuse, oslib}; use file_handle::{FileHandle, FileOrHandle, OpenableFileHandle}; use mount_fd::{MPRError, MountFds}; use stat::{statx, StatExt}; use std::borrow::Cow; use std::collections::{btree_map, BTreeMap}; use std::ffi::{CStr, CString}; use std::fs::File; use std::io; use std::io::ErrorKind; use std::mem::MaybeUninit; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::str::FromStr; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, RwLock}; use std::time::Duration; use xattrmap::{AppliedRule, XattrMap}; const EMPTY_CSTR: &[u8] = b"\0"; type Handle = u64; struct HandleData { inode: Inode, file: RwLock, } struct ScopedWorkingDirectory { back_to: RawFd, } impl ScopedWorkingDirectory { fn new(new_wd: RawFd, old_wd: RawFd) -> ScopedWorkingDirectory { oslib::fchdir(new_wd).expect("the working directory should be changed"); ScopedWorkingDirectory { back_to: old_wd } } } impl Drop for ScopedWorkingDirectory { fn drop(&mut self) { oslib::fchdir(self.back_to).expect("the working directory should be changed"); } } fn set_working_directory(new_wd: RawFd, old_wd: RawFd) -> ScopedWorkingDirectory { ScopedWorkingDirectory::new(new_wd, old_wd) } /// The caching policy that the file system should report to the FUSE client. 
By default the FUSE /// protocol uses close-to-open consistency. This means that any cached contents of the file are /// invalidated the next time that file is opened. #[derive(Default, Debug, Clone)] pub enum CachePolicy { /// The client should never cache file data and all I/O should be directly forwarded to the /// server. This policy must be selected when file contents may change without the knowledge of /// the FUSE client (i.e., the file system does not have exclusive access to the directory). Never, /// This is almost same as Never, but it allows page cache of directories, dentries and attr /// cache in guest. In other words, it acts like cache=never for normal files, and like /// cache=always for directories, besides, metadata like dentries and attrs are kept as well. /// This policy can be used if: /// 1. the client wants to use Never policy but it's performance in I/O is not good enough /// 2. the file system has exclusive access to the directory /// 3. cache directory content and other fs metadata can make a difference on performance. Metadata, /// The client is free to choose when and how to cache file data. This is the default policy and /// uses close-to-open consistency as described in the enum documentation. #[default] Auto, /// The client should always cache file data. This means that the FUSE client will not /// invalidate any cached data that was returned by the file system the last time the file was /// opened. This policy should only be selected when the file system has exclusive access to the /// directory. Always, } impl FromStr for CachePolicy { type Err = &'static str; fn from_str(s: &str) -> Result { match &s.to_lowercase()[..] { "never" => Ok(CachePolicy::Never), "metadata" => Ok(CachePolicy::Metadata), "auto" => Ok(CachePolicy::Auto), "always" => Ok(CachePolicy::Always), _ => Err("invalid cache policy"), } } } /// When to use file handles to reference inodes instead of `O_PATH` file descriptors. 
#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] pub enum InodeFileHandlesMode { /// Never use file handles, always use `O_PATH` file descriptors. #[default] Never, /// Attempt to generate file handles, but fall back to `O_PATH` file descriptors where the /// underlying filesystem does not support file handles. Prefer, /// Always use file handles, never fall back to `O_PATH` file descriptors. Mandatory, } /// Options that configure the behavior of the file system. #[derive(Debug)] pub struct Config { /// How long the FUSE client should consider directory entries to be valid. If the contents of a /// directory can only be modified by the FUSE client (i.e., the file system has exclusive /// access), then this should be a large value. /// /// The default value for this option is 5 seconds. pub entry_timeout: Duration, /// How long the FUSE client should consider file and directory attributes to be valid. If the /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file /// system has exclusive access), then this should be set to a large value. /// /// The default value for this option is 5 seconds. pub attr_timeout: Duration, /// The caching policy the file system should use. See the documentation of `CachePolicy` for /// more details. pub cache_policy: CachePolicy, /// Whether the file system should enabled writeback caching. This can improve performance as it /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file /// system. However, enabling this option can increase the risk of data corruption if the file /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT** /// have exclusive access). Additionally, the file system should have read access to all files /// in the directory it is serving as the FUSE client may send read requests even for files /// opened with `O_WRONLY`. 
/// /// Therefore callers should only enable this option when they can guarantee that: 1) the file /// system has exclusive access to the directory and 2) the file system has read permissions for /// all files in that directory. /// /// The default value for this option is `false`. pub writeback: bool, /// The path of the root directory. /// /// The default is `/`. pub root_dir: String, /// A prefix to strip from the mount points listed in /proc/self/mountinfo. /// /// The default is `None`. pub mountinfo_prefix: Option, /// Whether the file system should support Extended Attributes (xattr). Enabling this feature may /// have a significant impact on performance, especially on write parallelism. This is the result /// of FUSE attempting to remove the special file privileges after each write request. /// /// The default value for this options is `false`. pub xattr: bool, /// An optional translation layer for host<->guest Extended Attribute (xattr) names. pub xattrmap: Option, /// The xattr name that "security.capability" is remapped to, if the client remapped it at all. /// If the client's xattrmap did not remap "security.capability", this will be `None`. pub xattr_security_capability: Option, /// Optional `File` object for /proc/self/fd. Callers can open a `File` and pass it here, so /// there's no need to open it in PassthroughFs::new(). This is specially useful for /// sandboxing. /// /// The default is `None`. pub proc_sfd_rawfd: Option, /// Optional `File` object for /proc/self/mountinfo. Callers can open a `File` and pass it /// here, so there is no need to open it in PassthroughFs::new(). This is especially useful /// for sandboxing. /// /// The default is `None`. pub proc_mountinfo_rawfd: Option, /// Whether the file system should announce submounts to the guest. 
Not doing so means that /// the FUSE client may see st_ino collisions: This stat field is passed through, so if the /// shared directory encompasses multiple mounts, some inodes (in different file systems) may /// have the same st_ino value. If the FUSE client does not know these inodes are in different /// file systems, then it will be oblivious to this collision. /// By announcing submount points, the FUSE client can create virtual submounts with distinct /// st_dev values where necessary, so that the combination of st_dev and st_ino will stay /// unique. /// On the other hand, it may be undesirable to let the client know the shared directory's /// submount structure. The user needs to decide which drawback weighs heavier for them, which /// is why this is a configurable option. /// /// The default is `false`. pub announce_submounts: bool, /// Whether to use file handles to reference inodes. We need to be able to open file /// descriptors for arbitrary inodes, and by default that is done by storing an `O_PATH` FD in /// `InodeData`. Not least because there is a maximum number of FDs a process can have open /// users may find it preferable to store a file handle instead, which we can use to open an FD /// when necessary. /// So this switch allows to choose between the alternatives: When set to `Never`, `InodeData` /// will store `O_PATH` FDs. Otherwise, we will attempt to generate and store a file handle /// instead. With `Prefer`, errors that are inherent to file handles (like no support from the /// underlying filesystem) lead to falling back to `O_PATH` FDs, and only generic errors (like /// `ENOENT` or `ENOMEM`) are passed to the guest. `Mandatory` enforces the use of file /// handles, returning all errors to the guest. /// /// The default is `Never`. pub inode_file_handles: InodeFileHandlesMode, /// Whether the file system should support READDIRPLUS (READDIR+LOOKUP) operations. /// /// The default is `false`. 
pub readdirplus: bool, /// Whether the file system should honor the O_DIRECT flag. If this option is disabled (which /// is the default value), that flag will be filtered out at `open_inode`. /// /// The default is `false`. pub allow_direct_io: bool, /// If `killpriv_v2` is true then it indicates that the file system is expected to clear the /// setuid and setgid bits. pub killpriv_v2: bool, /// Enable support for posix ACLs /// /// The default is `false`. pub posix_acl: bool, /// If `security_label` is true, then server will indicate to client /// to send any security context associated with file during file /// creation and set that security context on newly created file. /// This security context is expected to be security.selinux. /// /// The default is `false`. pub security_label: bool, /// If `clean_noatime` is true automatically clean up O_NOATIME flag to prevent potential /// permission errors. pub clean_noatime: bool, } impl Default for Config { fn default() -> Self { Config { entry_timeout: Duration::from_secs(5), attr_timeout: Duration::from_secs(5), cache_policy: Default::default(), writeback: false, root_dir: String::from("/"), mountinfo_prefix: None, xattr: false, xattrmap: None, xattr_security_capability: None, proc_sfd_rawfd: None, proc_mountinfo_rawfd: None, announce_submounts: false, inode_file_handles: Default::default(), readdirplus: true, allow_direct_io: false, killpriv_v2: false, posix_acl: false, security_label: false, clean_noatime: true, } } } /// A file system that simply "passes through" all requests it receives to the underlying file /// system. To keep the implementation simple it servers the contents of its root directory. Users /// that wish to serve only a specific directory should set up the environment so that that /// directory ends up as the root of the file system process. One way to accomplish this is via a /// combination of mount namespaces and the pivot_root system call. 
pub struct PassthroughFs { // File descriptors for various points in the file system tree. These fds are always opened with // the `O_PATH` option so they cannot be used for reading or writing any data. See the // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot // do with an fd opened with this flag. inodes: RwLock, next_inode: AtomicU64, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be // used for reading and writing data. handles: RwLock>>, next_handle: AtomicU64, // Maps mount IDs to an open FD on the respective ID for the purpose of open_by_handle_at(). // This is set when inode_file_handles is not never, since in the 'never' case, // open_by_handle_at() is not called. mount_fds: Option, // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from // `inodes` into one that can go into `handles`. This is accomplished by reading the // `/proc/self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are // meant to be serving doesn't have access to `/proc/self/fd`. proc_self_fd: File, // File descriptor pointing to the `/` directory. root_fd: File, // Whether writeback caching is enabled for this directory. This will only be true when // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`. writeback: AtomicBool, // Whether to announce submounts (i.e., whether the guest supports them and whether they are // enabled in the configuration) announce_submounts: AtomicBool, // Whether posix ACLs is enabled. posix_acl: AtomicBool, // Basic facts about the OS os_facts: oslib::OsFacts, // Whether the guest kernel supports the supplementary group extension. 
sup_group_extension: AtomicBool, cfg: Config, } impl PassthroughFs { pub fn new(mut cfg: Config) -> io::Result { let proc_self_fd = if let Some(fd) = cfg.proc_sfd_rawfd.take() { fd } else { openat( &libc::AT_FDCWD, "/proc/self/fd", libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, )? }; let root_fd = openat( &libc::AT_FDCWD, "/", libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, )?; let mount_fds = if cfg.inode_file_handles == InodeFileHandlesMode::Never { None } else { let mountinfo_fd = if let Some(fd) = cfg.proc_mountinfo_rawfd.take() { fd } else { openat( &libc::AT_FDCWD, "/proc/self/mountinfo", libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC, )? }; Some(MountFds::new(mountinfo_fd, cfg.mountinfo_prefix.clone())) }; let mut fs = PassthroughFs { inodes: RwLock::new(Default::default()), next_inode: AtomicU64::new(fuse::ROOT_ID + 1), handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(0), mount_fds, proc_self_fd, root_fd, writeback: AtomicBool::new(false), announce_submounts: AtomicBool::new(false), posix_acl: AtomicBool::new(false), sup_group_extension: AtomicBool::new(false), os_facts: oslib::OsFacts::new(), cfg, }; // Check to see if the client remapped "security.capability", if so, // stash its mapping since the daemon will have to enforce semantics // that the host kernel otherwise would if the xattrname was not mapped. let sec_xattr = unsafe { CStr::from_bytes_with_nul_unchecked(b"security.capability\0") }; fs.cfg.xattr_security_capability = fs .map_client_xattrname(sec_xattr) .ok() .filter(|n| !sec_xattr.eq(n)) .map(CString::from); fs.check_working_file_handles()?; // We need to clear the umask here because we want the client to be // able to set all the bits in the mode. 
oslib::umask(0o000); Ok(fs) } pub fn keep_fds(&self) -> Vec { vec![self.proc_self_fd.as_raw_fd()] } fn open_relative_to( &self, dir: &impl AsRawFd, pathname: &CStr, flags: i32, mode: Option, ) -> io::Result { let flags = libc::O_NOFOLLOW | libc::O_CLOEXEC | flags; if self.os_facts.has_openat2 { oslib::do_open_relative_to(dir, pathname, flags, mode) } else { oslib::openat(dir, pathname, flags, mode) } } fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result> { self.handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf) } fn open_inode(&self, inode: Inode, mut flags: i32) -> io::Result { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; // When writeback caching is enabled, the kernel may send read requests even if the // userspace program opened the file write-only. So we need to ensure that we have opened // the file for reading as well as writing. let writeback = self.writeback.load(Ordering::Relaxed); if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY { flags &= !libc::O_ACCMODE; flags |= libc::O_RDWR; } // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`. // However, this breaks atomicity as the file may have changed on disk, invalidating the // cached copy of the data in the kernel and the offset that the kernel thinks is the end of // the file. Just allow this for now as it is the user's responsibility to enable writeback // caching only for directories that are not shared. It also means that we need to clear the // `O_APPEND` flag. if writeback && flags & libc::O_APPEND != 0 { flags &= !libc::O_APPEND; } if !self.cfg.allow_direct_io && flags & libc::O_DIRECT != 0 { flags &= !libc::O_DIRECT; } data.open_file(flags | libc::O_CLOEXEC, &self.proc_self_fd)? .into_file() } /// Generate a file handle for `fd` using `FileHandle::from_fd()`. 
`st` is `fd`'s stat /// information (we may need the mount ID for errors/warnings). /// /// These are the possible return values: /// - `Ok(Some(_))`: Success, caller should use this file handle. /// - `Ok(None)`: No error, but no file handle is available. The caller should fall back to /// using an `O_PATH` FD. /// - `Err(_)`: An error occurred, the caller should return this to the guest. /// /// This function takes the chosen `self.cfg.inode_file_handles` mode into account: /// - `Never`: Always return `Ok(None)`. /// - `Prefer`: Return `Ok(None)` when file handles are not supported by this filesystem. /// Otherwise, return either `Ok(Some(_))` or `Err(_)`, depending on whether a file /// handle could be generated or not. /// - `Mandatory`: Never return `Ok(None)`. When the filesystem does not support file handles, /// return an `Err(_)`. /// /// When the filesystem does not support file handles, this is logged (as a warning in /// `Prefer` mode, and as an error in `Mandatory` mode) one time per filesystem. fn get_file_handle_opt( &self, fd: &impl AsRawFd, st: &StatExt, ) -> io::Result> { let handle = match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => { // Let's make this quick, so we can skip this case below return Ok(None); } InodeFileHandlesMode::Prefer | InodeFileHandlesMode::Mandatory => { FileHandle::from_fd(fd)? } }; if handle.is_none() { // No error, but no handle (because of EOPNOTSUPP/EOVERFLOW)? Log it. 
let io_err = io::Error::from_raw_os_error(libc::EOPNOTSUPP); let desc = match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => unreachable!(), InodeFileHandlesMode::Prefer => { "Filesystem does not support file handles, falling back to O_PATH FDs" } InodeFileHandlesMode::Mandatory => "Filesystem does not support file handles", }; // Use the MPRError object, because (with a mount ID obtained through statx()) // `self.mount_fds.error_for()` will attempt to add a prefix to the error description // that describes the offending filesystem by mount point and mount ID, and will also // suppress the message if we have already logged any error concerning file handles for // the respective filesystem (so we only log errors/warnings once). let err: MPRError = if st.mnt_id > 0 { // Valid mount ID // self.mount_fds won't be None if we enter here. self.mount_fds .as_ref() .unwrap() .error_for(st.mnt_id, io_err) } else { // No valid mount ID, return error object not bound to a filesystem io_err.into() } .set_desc(desc.to_string()); // In `Prefer` mode, warn; in `Mandatory` mode, log and return an error. // (Suppress logging if the error is silenced, which means that we have already logged // a warning/error for this filesystem.) match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => unreachable!(), InodeFileHandlesMode::Prefer => { if !err.silent() { warn!("{}", err); } } InodeFileHandlesMode::Mandatory => { if !err.silent() { error!("{}", err); } return Err(err.into_inner()); } } } Ok(handle) } fn make_file_handle_openable(&self, fh: &FileHandle) -> io::Result { // self.mount_fds won't be None if we enter here. 
fh.to_openable(self.mount_fds.as_ref().unwrap(), |fd, flags| { reopen_fd_through_proc(&fd, flags, &self.proc_self_fd) }) .map_err(|e| { if !e.silent() { error!("{}", e); } e.into_inner() }) } fn check_working_file_handles(&mut self) -> io::Result<()> { if self.cfg.inode_file_handles == InodeFileHandlesMode::Never { // No need to check anything return Ok(()); } // Try to open the root directory, turn it into a file handle, then try to open that file // handle to see whether file handles do indeed work // (Note that we pass through all I/O errors to the caller, because `PassthroughFs::init()` // will do these calls (`openat()`, `stat()`, etc.) anyway, so if they do not work now, // they probably are not going to work later either. Better to report errors early then.) let root_dir = openat( &libc::AT_FDCWD, self.cfg.root_dir.as_str(), libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, )?; let st = statx(&root_dir, None)?; if let Some(h) = self.get_file_handle_opt(&root_dir, &st)? { // Got an openable file handle, try opening it match self.make_file_handle_openable(&h)?.open(libc::O_PATH) { Ok(_) => (), Err(e) => match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => unreachable!(), InodeFileHandlesMode::Prefer => { warn!("Failed to open file handle for the root node: {}", e); warn!("File handles do not appear safe to use, disabling file handles altogether"); self.cfg.inode_file_handles = InodeFileHandlesMode::Never; } InodeFileHandlesMode::Mandatory => { error!("Failed to open file handle for the root node: {}", e); error!("Refusing to use (mandatory) file handles, as they do not appear safe to use"); return Err(e); } }, } } else { // Did not get an openable file handle (nor an error), so we cannot be in `mandatory` // mode. We also cannot be in `never` mode, because that is sorted out at the very // beginning of this function. Still, use `match` so the compiler could warn us if we // were to forget some (future?) variant. 
match self.cfg.inode_file_handles { InodeFileHandlesMode::Never => unreachable!(), InodeFileHandlesMode::Prefer => { warn!("Failed to generate a file handle for the root node, disabling file handles altogether"); self.cfg.inode_file_handles = InodeFileHandlesMode::Never; } InodeFileHandlesMode::Mandatory => unreachable!(), } } Ok(()) } fn do_lookup(&self, parent: Inode, name: &CStr) -> io::Result { let p = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; let p_file = p.get_file()?; let path_fd = { let fd = self.open_relative_to(&p_file, name, libc::O_PATH, None)?; // Safe because we just opened this fd. unsafe { File::from_raw_fd(fd) } }; let st = statx(&path_fd, None)?; // Note that this will always be `None` if `cfg.inode_file_handles` is `Never`, but we only // really need the handle when we do not have an `O_PATH` fd open for every inode. So if // `cfg.inode_file_handles` is `Never`, we do not need it anyway. let handle = self.get_file_handle_opt(&path_fd, &st)?; let mut attr_flags: u32 = 0; if st.st.st_mode & libc::S_IFMT == libc::S_IFDIR && self.announce_submounts.load(Ordering::Relaxed) && (st.st.st_dev != p.ids.dev || st.mnt_id != p.ids.mnt_id) { attr_flags |= fuse::ATTR_SUBMOUNT; } let ids = InodeIds { ino: st.st.st_ino, dev: st.st.st_dev, mnt_id: st.mnt_id, }; let existing_inode = Self::claim_inode(&self.inodes.read().unwrap(), handle.as_ref(), &ids); let inode = if let Some(inode) = existing_inode { inode } else { let file_or_handle = if let Some(h) = handle.as_ref() { FileOrHandle::Handle(self.make_file_handle_openable(h)?) } else { FileOrHandle::File(path_fd) }; // There is a possible race here where two (or more) threads end up creating an inode // ID. However, only the one in the thread that locks `self.inodes` first will be used // and the others are wasted. 
let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); let mut inodes = self.inodes.write().unwrap(); if let Some(inode) = Self::claim_inode(&inodes, handle.as_ref(), &ids) { // An inode was added concurrently while we did not hold a lock on `self.inodes`, so // we use that instead. `file_or_handle` will be dropped. inode } else { inodes.insert(Arc::new(InodeData { inode, file_or_handle, refcount: AtomicU64::new(1), ids, mode: st.st.st_mode, })); inode } }; Ok(Entry { inode, generation: 0, attr: st.st, attr_flags, attr_timeout: self.cfg.attr_timeout, entry_timeout: self.cfg.entry_timeout, }) } /// Attempts to get an inode from `inodes` and increment its refcount. Returns the inode /// number on success and `None` on failure. Reasons for failure can be that the inode isn't /// in the map or that the refcount is zero. This function will never increment a refcount /// that's already zero. fn claim_inode( inodes: &InodeStore, handle: Option<&FileHandle>, ids: &InodeIds, ) -> Option { let data = handle.and_then(|h| inodes.get_by_handle(h)).or_else(|| { inodes.get_by_ids(ids).filter(|data| { // When we have to fall back to looking up an inode by its inode ID, ensure // that we hit an entry that has a valid file descriptor. Having an FD // open means that the inode cannot really be deleted until the FD is // closed, so that the inode ID remains valid until we evict the // `InodeData`. With no FD open (and just a file handle), the inode can be // deleted while we still have our `InodeData`, and so the inode ID may be // reused by a completely different new inode. Such inodes must be looked // up by file handle, because this handle contains a generation ID to // differentiate between the old and the new inode. matches!(data.file_or_handle, FileOrHandle::File(_)) }) }); if let Some(data) = data { // We use a CAS loop instead of `fetch_add()`, because we must never increment the // refcount from zero to one. 
let mut n = data.refcount.load(Ordering::Relaxed); loop { if n == 0 { return None; } match data.refcount.compare_exchange_weak( n, n + 1, Ordering::Relaxed, Ordering::Relaxed, ) { Ok(_) => return Some(data.inode), Err(old) => n = old, } } } None } fn do_open( &self, inode: Inode, kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { // We need to clean the `O_APPEND` flag in case the file is mem mapped or if the flag // is later modified in the guest using `fcntl(F_SETFL)`. We do a per-write `O_APPEND` // check setting `RWF_APPEND` for non-mmapped writes, if necessary. let mut flags = flags & !(libc::O_APPEND as u32); // Clean O_NOATIME (unless specified otherwise with --preserve-noatime) to prevent // potential permission errors when running in unprivileged mode. if self.cfg.clean_noatime { flags &= !(libc::O_NOATIME as u32) } let file = RwLock::new({ let _killpriv_guard = if self.cfg.killpriv_v2 && kill_priv { drop_effective_cap("FSETID")? } else { None }; self.open_inode(inode, flags as i32)? }); if flags & (libc::O_TRUNC as u32) != 0 { let file = file.read().expect("poisoned lock"); self.clear_file_capabilities(file.as_raw_fd(), false)?; } let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); let data = HandleData { inode, file }; self.handles.write().unwrap().insert(handle, Arc::new(data)); let mut opts = OpenOptions::empty(); match self.cfg.cache_policy { // We only set the direct I/O option on files. 
CachePolicy::Never => opts.set( OpenOptions::DIRECT_IO, flags & (libc::O_DIRECTORY as u32) == 0, ), CachePolicy::Metadata => { if flags & (libc::O_DIRECTORY as u32) == 0 { opts |= OpenOptions::DIRECT_IO; } else { opts |= OpenOptions::CACHE_DIR | OpenOptions::KEEP_CACHE; } } CachePolicy::Always => { opts |= OpenOptions::KEEP_CACHE; if flags & (libc::O_DIRECTORY as u32) != 0 { opts |= OpenOptions::CACHE_DIR; } } _ => {} }; Ok((Some(handle), opts)) } fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> { let mut handles = self.handles.write().unwrap(); if let btree_map::Entry::Occupied(e) = handles.entry(handle) { if e.get().inode == inode { // We don't need to close the file here because that will happen automatically when // the last `Arc` is dropped. e.remove(); return Ok(()); } } Err(ebadf()) } fn do_getattr(&self, inode: Inode) -> io::Result<(libc::stat64, Duration)> { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let inode_file = data.get_file()?; let st = statx(&inode_file, None)?.st; Ok((st, self.cfg.attr_timeout)) } fn do_unlink(&self, parent: Inode, name: &CStr, flags: libc::c_int) -> io::Result<()> { let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; let parent_file = data.get_file()?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), flags) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn block_xattr(&self, name: &[u8]) -> bool { // Currently we only filter out posix acl xattrs. // If acls are enabled, there is nothing to filter. 
if self.posix_acl.load(Ordering::Relaxed) { return false; } let acl_access = "system.posix_acl_access".as_bytes(); let acl_default = "system.posix_acl_default".as_bytes(); acl_access.starts_with(name) || acl_default.starts_with(name) } fn map_client_xattrname<'a>(&self, name: &'a CStr) -> std::io::Result> { if self.block_xattr(name.to_bytes()) { return Err(io::Error::from_raw_os_error(libc::ENOTSUP)); } match &self.cfg.xattrmap { Some(map) => match map.map_client_xattr(name).expect("unterminated mapping") { AppliedRule::Deny => Err(io::Error::from_raw_os_error(libc::EPERM)), AppliedRule::Unsupported => Err(io::Error::from_raw_os_error(libc::ENOTSUP)), AppliedRule::Pass(new_name) => Ok(new_name), }, None => Ok(Cow::Borrowed(name)), } } fn map_server_xattrlist(&self, xattr_names: Vec) -> Vec { let all_xattrs = match &self.cfg.xattrmap { Some(map) => map .map_server_xattrlist(xattr_names) .expect("unterminated mapping"), None => xattr_names, }; // filter out the blocked xattrs let mut filtered = Vec::with_capacity(all_xattrs.len()); let all_xattrs = all_xattrs.split(|b| *b == 0).filter(|bs| !bs.is_empty()); for xattr in all_xattrs { if !self.block_xattr(xattr) { filtered.extend_from_slice(xattr); filtered.push(0); } } filtered.shrink_to_fit(); filtered } /// Clears file capabilities /// /// * `fd` - A file descriptor /// * `o_path` - Must be `true` if the file referred to by `fd` was opened with the `O_PATH` flag /// /// If it is not clear whether `fd` was opened with `O_PATH` it is safe to set `o_path` /// to `true`. fn clear_file_capabilities(&self, fd: RawFd, o_path: bool) -> io::Result<()> { match self.cfg.xattr_security_capability.as_ref() { // Unmapped, let the kernel take care of this. 
None => Ok(()), // Otherwise we have to uphold the same semantics the kernel // would; which is to drop the "security.capability" xattr // on write Some(xattrname) => { let res = if o_path { let proc_file_name = CString::new(format!("{fd}")) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory( self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd(), ); unsafe { libc::removexattr(proc_file_name.as_ptr(), xattrname.as_ptr()) } } else { unsafe { libc::fremovexattr(fd, xattrname.as_ptr()) } }; if res == 0 { Ok(()) } else { let eno = io::Error::last_os_error(); match eno.raw_os_error().unwrap() { libc::ENODATA | libc::ENOTSUP => Ok(()), _ => Err(eno), } } } } } #[allow(clippy::too_many_arguments)] fn do_create( &self, ctx: &Context, parent_file: &InodeFile, name: &CStr, mode: u32, flags: u32, umask: u32, extensions: Extensions, ) -> io::Result { let fd = { let _credentials_guard = UnixCredentials::new(ctx.uid, ctx.gid) .supplementary_gid( self.sup_group_extension.load(Ordering::Relaxed), extensions.sup_gid, ) .set()?; let _umask_guard = self .posix_acl .load(Ordering::Relaxed) .then(|| oslib::ScopedUmask::new(umask)); // Add libc:O_EXCL to ensure we're not accidentally opening a file the guest wouldn't // be allowed to access otherwise. self.open_relative_to( parent_file, name, flags as i32 | libc::O_CREAT | libc::O_EXCL, mode.into(), )? }; // Set security context if let Some(secctx) = extensions.secctx { // Remap security xattr name. 
let xattr_name = match self.map_client_xattrname(&secctx.name) { Ok(xattr_name) => xattr_name, Err(e) => { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), 0); } return Err(e); } }; let ret = unsafe { libc::fsetxattr( fd, xattr_name.as_ptr(), secctx.secctx.as_ptr() as *const libc::c_void, secctx.secctx.len(), 0, ) }; if ret != 0 { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), 0); } return Err(io::Error::last_os_error()); } } Ok(fd) } fn do_mknod_mkdir_symlink_secctx( &self, parent_file: &InodeFile, name: &CStr, secctx: &SecContext, ) -> io::Result<()> { // Remap security xattr name. let xattr_name = self.map_client_xattrname(&secctx.name)?; // Set security context on newly created node. It could be // device node as well, so it is not safe to open the node // and call fsetxattr(). Instead, use the fchdir(proc_fd) // and call setxattr(o_path_fd). We use this trick while // setting xattr as well. // Open O_PATH fd for dir/symlink/special node just created. 
let path_fd = self.open_relative_to(parent_file, name, libc::O_PATH, None)?; let procname = CString::new(format!("{path_fd}")) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)); let procname = match procname { Ok(name) => name, Err(error) => { return Err(error); } }; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); let res = unsafe { libc::setxattr( procname.as_ptr(), xattr_name.as_ptr(), secctx.secctx.as_ptr() as *const libc::c_void, secctx.secctx.len(), 0, ) }; let res_err = io::Error::last_os_error(); if res == 0 { Ok(()) } else { Err(res_err) } } } fn forget_one(inodes: &mut InodeStore, inode: Inode, count: u64) { if let Some(data) = inodes.get(&inode) { // Acquiring the write lock on the inode map prevents new lookups from incrementing the // refcount but there is the possibility that a previous lookup already acquired a // reference to the inode data and is in the process of updating the refcount so we need // to loop here until we can decrement successfully. loop { let refcount = data.refcount.load(Ordering::Relaxed); // Saturating sub because it doesn't make sense for a refcount to go below zero and // we don't want misbehaving clients to cause integer overflow. let new_count = refcount.saturating_sub(count); // We don't need any stronger ordering, because the refcount itself doesn't protect any // data. The `inodes` map is protected since we hold an exclusive reference (obtained // from an `RwLock`). if data.refcount.compare_exchange( refcount, new_count, Ordering::Relaxed, Ordering::Relaxed, ) == Ok(refcount) { if new_count == 0 { // We just removed the last refcount for this inode. There's no need for an // acquire fence here because we hold a write lock on the inode map and any // thread that is waiting to do a forget on the same inode will have to wait // until we release the lock. So there's is no other release store for us to // synchronize with before deleting the entry. 
inodes.remove(&inode); } break; } } } } impl FileSystem for PassthroughFs { type Inode = Inode; type Handle = Handle; type DirIter = ReadDir>; fn init(&self, capable: FsOptions) -> io::Result { // We use `O_PATH` because we just want this for traversing the directory tree // and not for actually reading the contents. We don't use `open_relative_to()` // here because we are not opening a guest-provided pathname. Also, `self.cfg.root_dir` // is an absolute pathname, thus not relative to CWD, so we will not be able to open it // if "/" didn't change (e.g., chroot or pivot_root) let path_fd = openat( &libc::AT_FDCWD, self.cfg.root_dir.as_str(), libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, )?; let st = statx(&path_fd, None)?; let handle = self.get_file_handle_opt(&path_fd, &st)?; let file_or_handle = if let Some(h) = handle.as_ref() { FileOrHandle::Handle(self.make_file_handle_openable(h)?) } else { FileOrHandle::File(path_fd) }; let mut inodes = self.inodes.write().unwrap(); // Not sure why the root inode gets a refcount of 2 but that's what libfuse does. 
inodes.insert(Arc::new(InodeData { inode: fuse::ROOT_ID, file_or_handle, refcount: AtomicU64::new(2), ids: InodeIds { ino: st.st.st_ino, dev: st.st.st_dev, mnt_id: st.mnt_id, }, mode: st.st.st_mode, })); let mut opts = if self.cfg.readdirplus { FsOptions::DO_READDIRPLUS | FsOptions::READDIRPLUS_AUTO } else { FsOptions::empty() }; if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) { opts |= FsOptions::WRITEBACK_CACHE; self.writeback.store(true, Ordering::Relaxed); } if self.cfg.announce_submounts { if capable.contains(FsOptions::SUBMOUNTS) { self.announce_submounts.store(true, Ordering::Relaxed); } else { eprintln!("Warning: Cannot announce submounts, client does not support it"); } } if self.cfg.killpriv_v2 { if capable.contains(FsOptions::HANDLE_KILLPRIV_V2) { opts |= FsOptions::HANDLE_KILLPRIV_V2; } else { warn!("Cannot enable KILLPRIV_V2, client does not support it"); } } if self.cfg.posix_acl { let acl_required_flags = FsOptions::POSIX_ACL | FsOptions::DONT_MASK | FsOptions::SETXATTR_EXT; if capable.contains(acl_required_flags) { opts |= acl_required_flags; self.posix_acl.store(true, Ordering::Relaxed); debug!("init: enabling posix acl"); } else { error!("Cannot enable posix ACLs, client does not support it"); return Err(io::Error::from_raw_os_error(libc::EPROTO)); } } if self.cfg.security_label { if capable.contains(FsOptions::SECURITY_CTX) { opts |= FsOptions::SECURITY_CTX; } else { error!("Cannot enable security label. 
kernel does not support FUSE_SECURITY_CTX capability"); return Err(io::Error::from_raw_os_error(libc::EPROTO)); } } if capable.contains(FsOptions::CREATE_SUPP_GROUP) { self.sup_group_extension.store(true, Ordering::Relaxed); } Ok(opts) } fn destroy(&self) { self.handles.write().unwrap().clear(); self.inodes.write().unwrap().clear(); self.writeback.store(false, Ordering::Relaxed); self.announce_submounts.store(false, Ordering::Relaxed); self.posix_acl.store(false, Ordering::Relaxed); self.sup_group_extension.store(false, Ordering::Relaxed); } fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let inode_file = data.get_file()?; let mut out = MaybeUninit::::zeroed(); // Safe because this will only modify `out` and we check the return value. let res = unsafe { libc::fstatvfs64(inode_file.as_raw_fd(), out.as_mut_ptr()) }; if res == 0 { // Safe because the kernel guarantees that `out` has been initialized. 
Ok(unsafe { out.assume_init() }) } else { Err(io::Error::last_os_error()) } } fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { let mut inodes = self.inodes.write().unwrap(); forget_one(&mut inodes, inode, count) } fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) { let mut inodes = self.inodes.write().unwrap(); for (inode, count) in requests { forget_one(&mut inodes, inode, count) } } fn opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { self.do_open(inode, false, flags | (libc::O_DIRECTORY as u32)) } fn releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()> { self.do_release(inode, handle) } fn mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, extensions: Extensions, ) -> io::Result { let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; let parent_file = data.get_file()?; let res = { let _credentials_guard = UnixCredentials::new(ctx.uid, ctx.gid) .supplementary_gid( self.sup_group_extension.load(Ordering::Relaxed), extensions.sup_gid, ) .set()?; let _umask_guard = self .posix_acl .load(Ordering::Relaxed) .then(|| oslib::ScopedUmask::new(umask)); // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::mkdirat(parent_file.as_raw_fd(), name.as_ptr(), mode) } }; if res < 0 { return Err(io::Error::last_os_error()); } // Set security context on dir. 
if let Some(secctx) = extensions.secctx { if let Err(e) = self.do_mknod_mkdir_symlink_secctx(&parent_file, name, &secctx) { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), libc::AT_REMOVEDIR); }; return Err(e); } } self.do_lookup(parent, name) } fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { self.do_unlink(parent, name, libc::AT_REMOVEDIR) } fn readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result { if size == 0 { return Ok(ReadDir::default()); } let data = self.find_handle(handle, inode)?; let buf = vec![0; size as usize]; // Since we are going to work with the kernel offset, we have to acquire the file // lock for both the `lseek64` and `getdents64` syscalls to ensure that no other // thread changes the kernel offset while we are using it. let dir = data.file.write().unwrap(); ReadDir::new(&*dir, offset as libc::off64_t, buf) } fn open( &self, _ctx: Context, inode: Inode, kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { self.do_open(inode, kill_priv, flags) } fn release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option, ) -> io::Result<()> { self.do_release(inode, handle) } fn create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, kill_priv: bool, flags: u32, umask: u32, extensions: Extensions, ) -> io::Result<(Entry, Option, OpenOptions)> { let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; let parent_file = data.get_file()?; // We need to clean the `O_APPEND` flag in case the file is mem mapped or if the flag // is later modified in the guest using `fcntl(F_SETFL)`. We do a per-write `O_APPEND` // check setting `RWF_APPEND` for non-mmapped writes, if necessary. 
let create_flags = flags & !(libc::O_APPEND as u32); let fd = self.do_create( &ctx, &parent_file, name, mode, create_flags, umask, extensions, ); let (entry, handle) = match fd { Err(last_error) => { // Ignore the error if the file exists and O_EXCL is not present in `flags` match last_error.kind() { io::ErrorKind::AlreadyExists => { if (flags as i32 & libc::O_EXCL) != 0 { return Err(last_error); } } _ => return Err(last_error), } let entry = self.do_lookup(parent, name)?; let (handle, _) = self.do_open(entry.inode, kill_priv, flags)?; let handle = handle.ok_or_else(ebadf)?; (entry, handle) } Ok(fd) => { // Safe because we just opened this fd. let file = RwLock::new(unsafe { File::from_raw_fd(fd) }); let entry = self.do_lookup(parent, name)?; let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); let data = HandleData { inode: entry.inode, file, }; self.handles.write().unwrap().insert(handle, Arc::new(data)); (entry, handle) } }; let mut opts = OpenOptions::empty(); match self.cfg.cache_policy { CachePolicy::Never => opts |= OpenOptions::DIRECT_IO, CachePolicy::Metadata => opts |= OpenOptions::DIRECT_IO, CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE, _ => {} }; Ok((entry, Some(handle), opts)) } fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { self.do_unlink(parent, name, 0) } fn setupmapping( &self, _ctx: Context, inode: Inode, _handle: Handle, foffset: u64, len: u64, flags: u64, moffset: u64, vu_req: &mut T, ) -> io::Result<()> { debug!( "setupmapping: ino {:?} foffset {} len {} flags {} moffset {}", inode, foffset, len, flags, moffset ); let open_flags = if (flags & fuse::SetupmappingFlags::WRITE.bits()) != 0 { libc::O_RDWR } else { libc::O_RDONLY }; let file = self.open_inode(inode, open_flags)?; (*vu_req).map(foffset, moffset, len, flags, file.as_raw_fd()) } fn removemapping( &self, _ctx: Context, requests: Vec, vu_req: &mut T, ) -> io::Result<()> { (*vu_req).unmap(requests) } fn read( &self, _ctx: Context, inode: 
Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option, _flags: u32, ) -> io::Result { let data = self.find_handle(handle, inode)?; // This is safe because write_from uses preadv64, so the underlying file descriptor // offset is not affected by this operation. let f = data.file.read().unwrap(); w.write_from(&f, size as usize, offset) } fn write( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option, delayed_write: bool, kill_priv: bool, flags: u32, ) -> io::Result { let data = self.find_handle(handle, inode)?; // This is safe because read_to uses `pwritev2(2)`, so the underlying file descriptor // offset is not affected by this operation. let f = data.file.read().unwrap(); { let _killpriv_guard = if self.cfg.killpriv_v2 && kill_priv { // We need to drop FSETID during a write so that the kernel will remove setuid // or setgid bits from the file if it was written to by someone other than the // owner. drop_effective_cap("FSETID")? } else { None }; self.clear_file_capabilities(f.as_raw_fd(), false)?; // We don't set the `RWF_APPEND` (i.e., equivalent to `O_APPEND`) flag, if it's a // delayed write (i.e., using writeback mode or a mem mapped file) even if the file // was open in append mode, since the guest kernel sends the correct offset. // For non-delayed writes, we set the append mode, if necessary, to correctly handle // writes on a file shared among VMs. This case can only be handled correctly if the // write on the underlying file is performed in append mode. 
let is_append = flags & libc::O_APPEND as u32 != 0; let flags = (!delayed_write && is_append).then_some(oslib::WritevFlags::RWF_APPEND); r.read_to(&f, size as usize, offset, flags) } } fn getattr( &self, _ctx: Context, inode: Inode, _handle: Option, ) -> io::Result<(libc::stat64, Duration)> { self.do_getattr(inode) } fn setattr( &self, _ctx: Context, inode: Inode, attr: libc::stat64, handle: Option, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)> { let inode_data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; // In this case, we need to open a new O_RDWR FD let rdwr_inode_file = handle.is_none() && valid.intersects(SetattrValid::SIZE); let inode_file = if rdwr_inode_file { inode_data.open_file(libc::O_NONBLOCK | libc::O_RDWR, &self.proc_self_fd)? } else { inode_data.get_file()? }; enum Data { Handle(Arc, RawFd), ProcPath(CString), } // If we have a handle then use it otherwise get a new fd from the inode. let data = if let Some(handle) = handle { let hd = self.find_handle(handle, inode)?; let fd = hd.file.write().unwrap().as_raw_fd(); Data::Handle(hd, fd) } else { let pathname = CString::new(format!("{}", inode_file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Data::ProcPath(pathname) }; if valid.contains(SetattrValid::MODE) { // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { match data { Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode), Data::ProcPath(ref p) => { libc::fchmodat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), attr.st_mode, 0) } } }; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.intersects(SetattrValid::UID | SetattrValid::GID) { let uid = if valid.contains(SetattrValid::UID) { attr.st_uid } else { // Cannot use -1 here because these are unsigned values. u32::MAX }; let gid = if valid.contains(SetattrValid::GID) { attr.st_gid } else { // Cannot use -1 here because these are unsigned values. 
u32::MAX }; self.clear_file_capabilities(inode_file.as_raw_fd(), true)?; // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::fchownat( inode_file.as_raw_fd(), empty.as_ptr(), uid, gid, libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, ) }; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.contains(SetattrValid::SIZE) { let fd = match data { Data::Handle(_, fd) => fd, _ => { // Should have opened an O_RDWR inode_file above assert!(rdwr_inode_file); inode_file.as_raw_fd() } }; let _killpriv_guard = if self.cfg.killpriv_v2 && valid.contains(SetattrValid::KILL_SUIDGID) { drop_effective_cap("FSETID")? } else { None }; // Safe because this doesn't modify any memory and we check the return value. let res = self .clear_file_capabilities(fd, false) .map(|_| unsafe { libc::ftruncate(fd, attr.st_size) })?; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) { let mut tvs = [ libc::timespec { tv_sec: 0, tv_nsec: libc::UTIME_OMIT, }, libc::timespec { tv_sec: 0, tv_nsec: libc::UTIME_OMIT, }, ]; if valid.contains(SetattrValid::ATIME_NOW) { tvs[0].tv_nsec = libc::UTIME_NOW; } else if valid.contains(SetattrValid::ATIME) { tvs[0].tv_sec = attr.st_atime; tvs[0].tv_nsec = attr.st_atime_nsec; } if valid.contains(SetattrValid::MTIME_NOW) { tvs[1].tv_nsec = libc::UTIME_NOW; } else if valid.contains(SetattrValid::MTIME) { tvs[1].tv_sec = attr.st_mtime; tvs[1].tv_nsec = attr.st_mtime_nsec; } // Safe because this doesn't modify any memory and we check the return value. 
let res = match data { Data::Handle(_, fd) => unsafe { libc::futimens(fd, tvs.as_ptr()) }, Data::ProcPath(ref p) => unsafe { libc::utimensat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), tvs.as_ptr(), 0) }, }; if res < 0 { return Err(io::Error::last_os_error()); } } self.do_getattr(inode) } fn rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()> { let old_inode = self .inodes .read() .unwrap() .get(&olddir) .map(Arc::clone) .ok_or_else(ebadf)?; let new_inode = self .inodes .read() .unwrap() .get(&newdir) .map(Arc::clone) .ok_or_else(ebadf)?; let old_file = old_inode.get_file()?; let new_file = new_inode.get_file()?; // Safe because this doesn't modify any memory and we check the return value. // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands // and we have glibc 2.28. let res = unsafe { libc::syscall( libc::SYS_renameat2, old_file.as_raw_fd(), oldname.as_ptr(), new_file.as_raw_fd(), newname.as_ptr(), flags, ) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, extensions: Extensions, ) -> io::Result { let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; let parent_file = data.get_file()?; let res = { let _credentials_guard = UnixCredentials::new(ctx.uid, ctx.gid) .supplementary_gid( self.sup_group_extension.load(Ordering::Relaxed), extensions.sup_gid, ) .set()?; let _umask_guard = self .posix_acl .load(Ordering::Relaxed) .then(|| oslib::ScopedUmask::new(umask)); // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::mknodat( parent_file.as_raw_fd(), name.as_ptr(), mode as libc::mode_t, u64::from(rdev), ) } }; if res < 0 { return Err(io::Error::last_os_error()); } // Set security context on node. 
if let Some(secctx) = extensions.secctx { if let Err(e) = self.do_mknod_mkdir_symlink_secctx(&parent_file, name, &secctx) { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), 0); }; return Err(e); } } self.do_lookup(parent, name) } fn link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let new_inode = self .inodes .read() .unwrap() .get(&newparent) .map(Arc::clone) .ok_or_else(ebadf)?; let inode_file = data.get_file()?; let newparent_file = new_inode.get_file()?; let procname = CString::new(format!("{}", inode_file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::linkat( self.proc_self_fd.as_raw_fd(), procname.as_ptr(), newparent_file.as_raw_fd(), newname.as_ptr(), libc::AT_SYMLINK_FOLLOW, ) }; if res == 0 { self.do_lookup(newparent, newname) } else { Err(io::Error::last_os_error()) } } fn symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, extensions: Extensions, ) -> io::Result { let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; let parent_file = data.get_file()?; let res = { let _credentials_guard = UnixCredentials::new(ctx.uid, ctx.gid) .supplementary_gid( self.sup_group_extension.load(Ordering::Relaxed), extensions.sup_gid, ) .set()?; // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::symlinkat(linkname.as_ptr(), parent_file.as_raw_fd(), name.as_ptr()) } }; if res < 0 { return Err(io::Error::last_os_error()); } // Set security context on symlink. 
if let Some(secctx) = extensions.secctx { if let Err(e) = self.do_mknod_mkdir_symlink_secctx(&parent_file, name, &secctx) { unsafe { libc::unlinkat(parent_file.as_raw_fd(), name.as_ptr(), 0); }; return Err(e); } } self.do_lookup(parent, name) } fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result> { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let inode_file = data.get_file()?; let mut buf = vec![0; libc::PATH_MAX as usize]; // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; // Safe because this will only modify the contents of `buf` and we check the return value. let res = unsafe { libc::readlinkat( inode_file.as_raw_fd(), empty.as_ptr(), buf.as_mut_ptr() as *mut libc::c_char, buf.len(), ) }; if res < 0 { return Err(io::Error::last_os_error()); } buf.resize(res as usize, 0); Ok(buf) } fn flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()> { let data = self.find_handle(handle, inode)?; // Since this method is called whenever an fd is closed in the client, we can emulate that // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe // because this doesn't modify any memory and we check the return values. unsafe { let newfd = libc::dup(data.file.write().unwrap().as_raw_fd()); if newfd < 0 { return Err(io::Error::last_os_error()); } if libc::close(newfd) < 0 { Err(io::Error::last_os_error()) } else { Ok(()) } } } fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { let data = self.find_handle(handle, inode)?; let fd = data.file.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. 
let res = unsafe { if datasync { libc::fdatasync(fd) } else { libc::fsync(fd) } }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn fsyncdir( &self, ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()> { self.fsync(ctx, inode, datasync, handle) } fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let inode_file = data.get_file()?; let st = statx(&inode_file, None)?.st; let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK); if mode == libc::F_OK { // The file exists since we were able to call `stat(2)` on it. return Ok(()); } if (mode & libc::R_OK) != 0 && ctx.uid != 0 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0) && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0) && st.st_mode & 0o004 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } if (mode & libc::W_OK) != 0 && ctx.uid != 0 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0) && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0) && st.st_mode & 0o002 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } // root can only execute something if it is executable by one of the owner, the group, or // everyone. 
if (mode & libc::X_OK) != 0 && (ctx.uid != 0 || st.st_mode & 0o111 == 0) && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0) && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0) && st.st_mode & 0o001 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } Ok(()) } fn setxattr( &self, ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, extra_flags: SetxattrFlags, ) -> io::Result<()> { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let name = self.map_client_xattrname(name)?; // If we are setting posix access acl and if SGID needs to be // cleared, then switch to caller's gid and drop CAP_FSETID // and that should make sure host kernel clears SGID. // // This probably will not work when we support idmapped mounts. // In that case we will need to find a non-root gid and switch // to it. (Instead of gid in request). Fix it when we support // idmapped mounts. let xattr_name = name.as_ref().to_str().unwrap(); let _clear_sgid_guard = if self.posix_acl.load(Ordering::Relaxed) && extra_flags.contains(SetxattrFlags::SETXATTR_ACL_KILL_SGID) && xattr_name.eq("system.posix_acl_access") { let cap_guard = drop_effective_cap("FSETID")?; let credentials_guard = UnixCredentials::new(ctx.uid, ctx.gid).set()?; // If `UnixCredentials::set()` changes the effective user ID to non-zero, then the // effective set is cleared from all capabilities. When switching back to root the // permitted set is copied to the effective set. We need to keep `DAC_READ_SEARCH` // to use file handles. 
if self.cfg.inode_file_handles != InodeFileHandlesMode::Never { if let Err(e) = crate::util::add_cap_to_eff("DAC_READ_SEARCH") { warn!( "failed to add 'DAC_READ_SEARCH' to the effective set of capabilities: {}", e ); } } (cap_guard, credentials_guard) } else { (None, None) }; let res = if is_safe_inode(data.mode) { // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; self.clear_file_capabilities(file.as_raw_fd(), false)?; // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::fsetxattr( file.as_raw_fd(), name.as_ptr(), value.as_ptr() as *const libc::c_void, value.len(), flags as libc::c_int, ) } } else { let file = data.get_file()?; self.clear_file_capabilities(file.as_raw_fd(), true)?; let procname = CString::new(format!("{}", file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); // Safe because this doesn't modify any memory and we check the return value. 
unsafe { libc::setxattr( procname.as_ptr(), name.as_ptr(), value.as_ptr() as *const libc::c_void, value.len(), flags as libc::c_int, ) } }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } let mut buf = vec![0; size as usize]; let name = self.map_client_xattrname(name).map_err(|e| { if e.kind() == ErrorKind::PermissionDenied { io::Error::from_raw_os_error(libc::ENODATA) } else { e } })?; let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let res = if is_safe_inode(data.mode) { // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; // Safe because this will only modify the contents of `buf`. unsafe { libc::fgetxattr( file.as_raw_fd(), name.as_ptr(), buf.as_mut_ptr() as *mut libc::c_void, size as libc::size_t, ) } } else { let file = data.get_file()?; let procname = CString::new(format!("{}", file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); // Safe because this will only modify the contents of `buf`. 
unsafe { libc::getxattr( procname.as_ptr(), name.as_ptr(), buf.as_mut_ptr() as *mut libc::c_void, size as libc::size_t, ) } }; if res < 0 { return Err(io::Error::last_os_error()); } if size == 0 { Ok(GetxattrReply::Count(res as u32)) } else { buf.resize(res as usize, 0); Ok(GetxattrReply::Value(buf)) } } fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let mut buf = vec![0; size as usize]; let res = if is_safe_inode(data.mode) { // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; // Safe because this will only modify the contents of `buf`. unsafe { libc::flistxattr( file.as_raw_fd(), buf.as_mut_ptr() as *mut libc::c_char, size as libc::size_t, ) } } else { let file = data.get_file()?; let procname = CString::new(format!("{}", file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); // Safe because this will only modify the contents of `buf`. 
unsafe { libc::listxattr( procname.as_ptr(), buf.as_mut_ptr() as *mut libc::c_char, size as libc::size_t, ) } }; if res < 0 { return Err(io::Error::last_os_error()); } if size == 0 { Ok(ListxattrReply::Count(res as u32)) } else { buf.resize(res as usize, 0); let buf = self.map_server_xattrlist(buf); Ok(ListxattrReply::Names(buf)) } } fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let name = self.map_client_xattrname(name)?; let res = if is_safe_inode(data.mode) { // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::fremovexattr(file.as_raw_fd(), name.as_ptr()) } } else { let file = data.get_file()?; let procname = CString::new(format!("{}", file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let _working_dir_guard = set_working_directory(self.proc_self_fd.as_raw_fd(), self.root_fd.as_raw_fd()); // Safe because this doesn't modify any memory and we check the return value. unsafe { libc::removexattr(procname.as_ptr(), name.as_ptr()) } }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()> { let data = self.find_handle(handle, inode)?; let fd = data.file.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. 
let res = unsafe { libc::fallocate64( fd, mode as libc::c_int, offset as libc::off64_t, length as libc::off64_t, ) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn lseek( &self, _ctx: Context, inode: Inode, handle: Handle, offset: u64, whence: u32, ) -> io::Result { let data = self.find_handle(handle, inode)?; let fd = data.file.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::lseek(fd, offset as libc::off64_t, whence as libc::c_int) }; if res < 0 { Err(io::Error::last_os_error()) } else { Ok(res as u64) } } fn copyfilerange( &self, _ctx: Context, inode_in: Inode, handle_in: Handle, offset_in: u64, inode_out: Inode, handle_out: Handle, offset_out: u64, len: u64, flags: u64, ) -> io::Result { let data_in = self.find_handle(handle_in, inode_in)?; // Take just a read lock as we're not going to alter the file descriptor offset. let fd_in = data_in.file.read().unwrap().as_raw_fd(); let data_out = self.find_handle(handle_out, inode_out)?; // Take just a read lock as we're not going to alter the file descriptor offset. let fd_out = data_out.file.read().unwrap().as_raw_fd(); // Safe because this will only modify `offset_in` and `offset_out` and we check // the return value. let res = unsafe { libc::syscall( libc::SYS_copy_file_range, fd_in, &mut (offset_in as i64) as &mut _ as *mut _, fd_out, &mut (offset_out as i64) as &mut _ as *mut _, len, flags, ) }; if res < 0 { Err(io::Error::last_os_error()) } else { Ok(res as usize) } } fn syncfs(&self, _ctx: Context, inode: Inode) -> io::Result<()> { // TODO: Branch here depending on whether virtiofsd announces submounts or not. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NOFOLLOW)?; let raw_fd = file.as_raw_fd(); debug!("syncfs: inode={}, mount_fd={}", inode, raw_fd); let ret = unsafe { libc::syncfs(raw_fd) }; if ret != 0 { // Thread-safe, because errno is stored in thread-local storage. 
Err(io::Error::last_os_error()) } else { Ok(()) } } } virtiofsd-1.10.0/src/passthrough/mount_fd.rs000064400000000000000000000404751046102023000172450ustar 00000000000000// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. use crate::passthrough::stat::{statx, MountId}; use std::collections::{HashMap, HashSet}; use std::ffi::CString; use std::fs::File; use std::io::{self, Read, Seek}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::sync::{Arc, Mutex, RwLock, Weak}; pub struct MountFd { map: Weak>>>, mount_id: MountId, file: File, } /// This type maintains a map where each entry maps a mount ID to an open FD on that mount. Other /// code can request an `Arc` for any mount ID. A key gets added to the map, when the /// first `Arc` for that mount ID is requested. A key gets removed from the map, when the /// last `Arc` for that mount ID is dropped. That is, map entries are reference-counted /// and other code can keep an entry in the map by holding on to an `Arc`. /// /// We currently have one use case for `MountFds`: /// /// 1. Creating a file handle only returns a mount ID, but opening a file handle requires an open FD /// on the respective mount. So we look that up in the map. pub struct MountFds { map: Arc>>>, /// /proc/self/mountinfo mountinfo: Mutex, /// An optional prefix to strip from all mount points in mountinfo mountprefix: Option, /// Set of filesystems for which we have already logged file handle errors error_logged: Arc>>, } impl MountFd { pub fn file(&self) -> &File { &self.file } } /** * Error object (to be used as `Result`) for mount-point-related errors (hence MPR). * Includes a description (that is auto-generated from the `io::Error` at first), which can be * overridden with `MPRError::set_desc()`, or given a prefix with `MPRError::prefix()`. * * The full description can be retrieved through the `Display` trait implementation (or the * auto-derived `ToString`). 
* * `MPRError` objects should generally be logged at some point, because they may indicate an error * in the user's configuration or a bug in virtiofsd. However, we only want to log them once per * filesystem, and so they can be silenced (setting `silent` to true if we know that we have * already logged an error for the respective filesystem) and then should not be logged. * * Naturally, a "mount-point-related" error should be associated with some mount point, which is * reflected in `fs_mount_id` and `fs_mount_root`. Setting these values will improve the error * description, because the `Display` implementation will prepend these values to the returned * string. * * To achieve this association, `MPRError` objects should be created through * `MountFds::error_for()`, which obtains the mount root path for the given mount ID, and will thus * try to not only set `fs_mount_id`, but `fs_mount_root` also. `MountFds::error_for()` will also * take care to set `silent` as appropriate. * * (Sometimes, though, we know an error is associated with a mount point, but we do not know with * which one. That is why the `fs_mount_id` field is optional.) */ #[derive(Debug)] pub struct MPRError { io: io::Error, description: String, silent: bool, fs_mount_id: Option, fs_mount_root: Option, } /// Type alias for convenience pub type MPRResult = Result; impl Drop for MountFd { fn drop(&mut self) { debug!( "Dropping MountFd: mount_id={}, mount_fd={}", self.mount_id, self.file.as_raw_fd(), ); // If `self.map.upgrade()` fails, then the `MountFds` structure was dropped while there was // still an `Arc` alive. In this case, we don't need to remove it from the map, // because the map doesn't exist anymore. if let Some(map) = self.map.upgrade() { let mut map = map.write().unwrap(); // After the refcount reaches zero and before we lock the map, there's a window where // the value can be concurrently replaced by a `Weak` pointer to a new `MountFd`. 
// Therefore, only remove the value if the refcount in the map is zero, too. if let Some(0) = map.get(&self.mount_id).map(Weak::strong_count) { map.remove(&self.mount_id); } } } } impl> From for MPRError { /// Convert any stringifyable error object that can be converted to an `io::Error` to an /// `MPRError`. Note that `fs_mount_id` and `fs_mount_root` are not set, so this `MPRError` /// object is not associated with any mount point. /// The initial description is taken from the original error object. fn from(err: E) -> Self { let description = err.to_string(); MPRError { io: err.into(), description, silent: false, fs_mount_id: None, fs_mount_root: None, } } } impl MPRError { /// Override the current description #[must_use] pub fn set_desc(mut self, s: String) -> Self { self.description = s; self } /// Add a prefix to the description #[must_use] pub fn prefix(self, s: String) -> Self { let new_desc = format!("{}: {}", s, self.description); self.set_desc(new_desc) } /// To give additional information to the user (when this error is logged), add the mount ID of /// the filesystem associated with this error #[must_use] fn set_mount_id(mut self, mount_id: MountId) -> Self { self.fs_mount_id = Some(mount_id); self } /// To give additional information to the user (when this error is logged), add the mount root /// path for the filesystem associated with this error #[must_use] fn set_mount_root(mut self, mount_root: String) -> Self { self.fs_mount_root = Some(mount_root); self } /// Mark this error as silent (i.e. not to be logged) #[must_use] fn silence(mut self) -> Self { self.silent = true; self } /// Return whether this error is silent (i.e. 
should not be logged) pub fn silent(&self) -> bool { self.silent } /// Return the `io::Error` from an `MPRError` and drop the rest pub fn into_inner(self) -> io::Error { self.io } } impl std::fmt::Display for MPRError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match (self.fs_mount_id, &self.fs_mount_root) { (None, None) => write!(f, "{}", self.description), (Some(id), None) => write!(f, "Filesystem with mount ID {}: {}", id, self.description), (None, Some(root)) => write!( f, "Filesystem mounted on \"{}\": {}", root, self.description ), (Some(id), Some(root)) => write!( f, "Filesystem mounted on \"{}\" (mount ID: {}): {}", root, id, self.description ), } } } impl std::error::Error for MPRError {} impl MountFds { pub fn new(mountinfo: File, mountprefix: Option) -> Self { MountFds { map: Default::default(), mountinfo: Mutex::new(mountinfo), mountprefix, error_logged: Default::default(), } } pub fn get(&self, mount_id: MountId, reopen_fd: F) -> MPRResult> where F: FnOnce(RawFd, libc::c_int) -> io::Result, { let existing_mount_fd = self .map // The `else` branch below (where `existing_mount_fd` matches `None`) takes a write lock // to insert a new mount FD into the hash map. This doesn't deadlock, because the read // lock taken here doesn't have its lifetime extended beyond the statement, because // `Weak::upgrade` returns a new pointer and not a reference into the read lock. .read() .unwrap() .get(&mount_id) // We treat a failed upgrade just like a non-existent key, because it means that all // strong references to the `MountFd` have disappeared, so it's in the process of being // dropped, but `MountFd::drop()` just did not yet get to remove it from the map. .and_then(Weak::upgrade); let mount_fd = if let Some(mount_fd) = existing_mount_fd { mount_fd } else { // `open_by_handle_at()` needs a non-`O_PATH` fd, which we will need to open here. 
We // are going to open the filesystem's mount point, but we do not know whether that is a // special file[1], and we must not open special files with anything but `O_PATH`, so // we have to get some `O_PATH` fd first that we can stat to find out whether it is // safe to open. // [1] While mount points are commonly directories, it is entirely possible for a // filesystem's root inode to be a regular or even special file. let mount_point = self.get_mount_root(mount_id)?; // Clone `mount_point` so we can still use it in error messages let c_mount_point = CString::new(mount_point.clone()).map_err(|e| { self.error_for(mount_id, e) .prefix(format!("Failed to convert \"{mount_point}\" to a CString")) })?; let mount_point_fd = unsafe { libc::open(c_mount_point.as_ptr(), libc::O_PATH) }; if mount_point_fd < 0 { return Err(self .error_for(mount_id, io::Error::last_os_error()) .prefix(format!("Failed to open mount point \"{mount_point}\""))); } // Safe because we have just opened this FD let mount_point_path = unsafe { File::from_raw_fd(mount_point_fd) }; // Ensure that `mount_point_path` refers to an inode with the mount ID we need let stx = statx(&mount_point_path, None).map_err(|e| { self.error_for(mount_id, e) .prefix(format!("Failed to stat mount point \"{mount_point}\"")) })?; if stx.mnt_id != mount_id { return Err(self .error_for(mount_id, io::Error::from_raw_os_error(libc::EIO)) .set_desc(format!( "Mount point's ({}) mount ID ({}) does not match expected value ({})", mount_point, stx.mnt_id, mount_id ))); } // Ensure that we can safely reopen `mount_point_path` with `O_RDONLY` let file_type = stx.st.st_mode & libc::S_IFMT; if file_type != libc::S_IFREG && file_type != libc::S_IFDIR { return Err(self .error_for(mount_id, io::Error::from_raw_os_error(libc::EIO)) .set_desc(format!( "Mount point \"{mount_point}\" is not a regular file or directory" ))); } // Now that we know that this is a regular file or directory, really open it let file = reopen_fd( 
mount_point_path.as_raw_fd(), libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC, ) .map_err(|e| { self.error_for(mount_id, e).prefix(format!( "Failed to reopen mount point \"{mount_point}\" for reading" )) })?; let mut mount_fds_locked = self.map.write().unwrap(); // As above: by calling `and_then(Weak::upgrade)`, we treat a failed upgrade just like a // non-existent key. If the key exists but upgrade fails, then `HashMap::insert()` // below will update the value. `MountFd::drop()` takes care to only remove a `MountFd` // without strong references from the map, and hence will not touch the updated one. if let Some(mount_fd) = mount_fds_locked.get(&mount_id).and_then(Weak::upgrade) { // A mount FD was added concurrently while we did not hold a lock on // `mount_fds.map` -- use that entry (`file` will be dropped). mount_fd } else { debug!( "Creating MountFd: mount_id={}, mount_fd={}", mount_id, file.as_raw_fd(), ); let mount_fd = Arc::new(MountFd { map: Arc::downgrade(&self.map), mount_id, file, }); mount_fds_locked.insert(mount_id, Arc::downgrade(&mount_fd)); mount_fd } }; Ok(mount_fd) } /// Given a mount ID, return the mount root path (by reading `/proc/self/mountinfo`) fn get_mount_root(&self, mount_id: MountId) -> MPRResult { let mountinfo = { let mountinfo_file = &mut *self.mountinfo.lock().unwrap(); mountinfo_file.rewind().map_err(|e| { self.error_for_nolookup(mount_id, e) .prefix("Failed to access /proc/self/mountinfo".into()) })?; let mut mountinfo = String::new(); mountinfo_file.read_to_string(&mut mountinfo).map_err(|e| { self.error_for_nolookup(mount_id, e) .prefix("Failed to read /proc/self/mountinfo".into()) })?; mountinfo }; let path = mountinfo.split('\n').find_map(|line| { let mut columns = line.split(char::is_whitespace); if columns.next()?.parse::().ok()? 
!= mount_id { return None; } // Skip parent mount ID, major:minor device ID, and the root within the filesystem // (to get to the mount path) columns.nth(3) }); match path { Some(p) => { let p = String::from(p); if let Some(prefix) = self.mountprefix.as_ref() { if let Some(suffix) = p.strip_prefix(prefix).filter(|s| !s.is_empty()) { Ok(suffix.into()) } else { // The shared directory is the mount point (strip_prefix() returned "") or // mount is outside the shared directory, so it must be the mount the root // directory is on Ok("/".into()) } } else { Ok(p) } } None => Err(self .error_for_nolookup(mount_id, io::Error::from_raw_os_error(libc::EINVAL)) .set_desc(format!("Failed to find mount root for mount ID {mount_id}"))), } } /// Generate an `MPRError` object for the given `mount_id`, and silence it if we have already /// generated such an object for that `mount_id`. /// (Called `..._nolookup`, because in contrast to `MountFds::error_for()`, this method will /// not try to look up the respective mount root path, and so is safe to call when such a /// lookup would be unwise.) fn error_for_nolookup>( &self, mount_id: MountId, err: E, ) -> MPRError { let err = MPRError::from(err).set_mount_id(mount_id); if self.error_logged.read().unwrap().contains(&mount_id) { err.silence() } else { self.error_logged.write().unwrap().insert(mount_id); err } } /// Call `self.error_for_nolookup()`, and if the `MPRError` object is not silenced, try to /// obtain the mount root path for the given `mount_id` and add it to the error object. /// (Note: DO NOT call this method from `MountFds::get_mount_root()`, because that may lead to /// an infinite loop.) 
pub fn error_for>(&self, mount_id: MountId, err: E) -> MPRError { let err = self.error_for_nolookup(mount_id, err); if err.silent() { // No need to add more information err } else { // This just adds some information, so ignore errors if let Ok(mount_root) = self.get_mount_root(mount_id) { err.set_mount_root(mount_root) } else { err } } } } virtiofsd-1.10.0/src/passthrough/stat/file_status.rs000064400000000000000000000026271046102023000207240ustar 00000000000000// SPDX-License-Identifier: BSD-3-Clause #[cfg(target_env = "gnu")] pub use libc::statx as statx_st; #[cfg(target_env = "gnu")] pub use libc::{STATX_BASIC_STATS, STATX_MNT_ID}; // musl provides the 'struct statx', but without stx_mnt_id. // However, the libc crate does not provide libc::statx // if musl is used. So we add just the required struct and // constants to make it works. #[cfg(not(target_env = "gnu"))] #[repr(C)] pub struct statx_st_timestamp { pub tv_sec: i64, pub tv_nsec: u32, pub __statx_timestamp_pad1: [i32; 1], } #[cfg(not(target_env = "gnu"))] #[repr(C)] pub struct statx_st { pub stx_mask: u32, pub stx_blksize: u32, pub stx_attributes: u64, pub stx_nlink: u32, pub stx_uid: u32, pub stx_gid: u32, pub stx_mode: u16, __statx_pad1: [u16; 1], pub stx_ino: u64, pub stx_size: u64, pub stx_blocks: u64, pub stx_attributes_mask: u64, pub stx_atime: statx_st_timestamp, pub stx_btime: statx_st_timestamp, pub stx_ctime: statx_st_timestamp, pub stx_mtime: statx_st_timestamp, pub stx_rdev_major: u32, pub stx_rdev_minor: u32, pub stx_dev_major: u32, pub stx_dev_minor: u32, pub stx_mnt_id: u64, __statx_pad2: u64, __statx_pad3: [u64; 12], } #[cfg(not(target_env = "gnu"))] pub const STATX_BASIC_STATS: libc::c_uint = 0x07ff; #[cfg(not(target_env = "gnu"))] pub const STATX_MNT_ID: libc::c_uint = 0x1000; virtiofsd-1.10.0/src/passthrough/stat.rs000064400000000000000000000120231046102023000163710ustar 00000000000000// Copyright 2021 Red Hat, Inc. All rights reserved. 
// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use std::ffi::CStr; use std::io; use std::mem::MaybeUninit; use std::os::unix::io::AsRawFd; mod file_status; use crate::oslib; use file_status::{statx_st, STATX_BASIC_STATS, STATX_MNT_ID}; const EMPTY_CSTR: &[u8] = b"\0"; pub type MountId = u64; pub struct StatExt { pub st: libc::stat64, pub mnt_id: MountId, } /* * Fields in libc::statx are only valid if their respective flag in * .stx_mask is set. This trait provides functions that allow safe * access to the libc::statx components we are interested in. * * (The implementations of these functions need to check whether the * associated flag is set, and then extract the respective information * to return it.) */ trait SafeStatXAccess { fn stat64(&self) -> Option; fn mount_id(&self) -> Option; } impl SafeStatXAccess for statx_st { fn stat64(&self) -> Option { fn makedev(maj: libc::c_uint, min: libc::c_uint) -> libc::dev_t { libc::makedev(maj, min) } if self.stx_mask & STATX_BASIC_STATS != 0 { /* * Unfortunately, we cannot use an initializer to create the * stat64 object, because it may contain padding and reserved * fields (depending on the architecture), and it does not * implement the Default trait. * So we take a zeroed struct and set what we can. * (Zero in all fields is wrong, but safe.) 
*/ let mut st = unsafe { MaybeUninit::::zeroed().assume_init() }; st.st_dev = makedev(self.stx_dev_major, self.stx_dev_minor); st.st_ino = self.stx_ino; st.st_mode = self.stx_mode as _; st.st_nlink = self.stx_nlink as _; st.st_uid = self.stx_uid; st.st_gid = self.stx_gid; st.st_rdev = makedev(self.stx_rdev_major, self.stx_rdev_minor); st.st_size = self.stx_size as _; st.st_blksize = self.stx_blksize as _; st.st_blocks = self.stx_blocks as _; st.st_atime = self.stx_atime.tv_sec; st.st_atime_nsec = self.stx_atime.tv_nsec as _; st.st_mtime = self.stx_mtime.tv_sec; st.st_mtime_nsec = self.stx_mtime.tv_nsec as _; st.st_ctime = self.stx_ctime.tv_sec; st.st_ctime_nsec = self.stx_ctime.tv_nsec as _; Some(st) } else { None } } fn mount_id(&self) -> Option { if self.stx_mask & STATX_MNT_ID != 0 { Some(self.stx_mnt_id) } else { None } } } fn get_mount_id(dir: &impl AsRawFd, path: &CStr) -> Option { let mut mount_id: libc::c_int = 0; let mut c_fh = oslib::CFileHandle::default(); oslib::name_to_handle_at(dir, path, &mut c_fh, &mut mount_id, libc::AT_EMPTY_PATH) .ok() .and(Some(mount_id as MountId)) } // Only works on Linux, and libc::SYS_statx is only defined for these // environments /// Performs a statx() syscall. libc provides libc::statx() that does /// the same, however, the system's libc may not have a statx() wrapper /// (e.g. glibc before 2.28), so linking to it may fail. /// libc::syscall() and libc::SYS_statx are always present, though, so /// we can safely rely on them. unsafe fn do_statx( dirfd: libc::c_int, pathname: *const libc::c_char, flags: libc::c_int, mask: libc::c_uint, statxbuf: *mut statx_st, ) -> libc::c_int { libc::syscall(libc::SYS_statx, dirfd, pathname, flags, mask, statxbuf) as libc::c_int } // Real statx() that depends on do_statx() pub fn statx(dir: &impl AsRawFd, path: Option<&CStr>) -> io::Result { let mut stx_ui = MaybeUninit::::zeroed(); // Safe because this is a constant value and a valid C string. 
let path = path.unwrap_or_else(|| unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }); // Safe because the kernel will only write data in `stx_ui` and we // check the return value. let res = unsafe { do_statx( dir.as_raw_fd(), path.as_ptr(), libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, STATX_BASIC_STATS | STATX_MNT_ID, stx_ui.as_mut_ptr(), ) }; if res >= 0 { // Safe because we are only going to use the SafeStatXAccess // trait methods let stx = unsafe { stx_ui.assume_init() }; // if `statx()` doesn't provide the mount id (before kernel 5.8), // let's try `name_to_handle_at()`, if everything fails just use 0 let mnt_id = stx .mount_id() .or_else(|| get_mount_id(dir, path)) .unwrap_or(0); Ok(StatExt { st: stx .stat64() .ok_or_else(|| io::Error::from_raw_os_error(libc::ENOSYS))?, mnt_id, }) } else { Err(io::Error::last_os_error()) } } virtiofsd-1.10.0/src/passthrough/util.rs000064400000000000000000000040411046102023000163740ustar 00000000000000// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. use std::ffi::CString; use std::fs::File; use std::io; use std::os::unix::io::{AsRawFd, FromRawFd}; /// Safe wrapper around libc::openat(). pub fn openat(dir_fd: &impl AsRawFd, path: &str, flags: libc::c_int) -> io::Result { let path_cstr = CString::new(path).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; // Safe because: // - CString::new() has returned success and thus guarantees `path_cstr` is a valid // NUL-terminated string // - this does not modify any memory // - we check the return value // We do not check `flags` because if the kernel cannot handle poorly specified flags then we // have much bigger problems. 
// openat(2) returns a non-negative fd on success, -1 on failure (errno set).
let fd = unsafe { libc::openat(dir_fd.as_raw_fd(), path_cstr.as_ptr(), flags) };
if fd >= 0 {
    // Safe because we just opened this fd
    Ok(unsafe { File::from_raw_fd(fd) })
} else {
    // openat(2) failed; convert errno into an io::Error for the caller.
    Err(io::Error::last_os_error())
}
}

/// Open `/proc/self/fd/{fd}` with the given flags to effectively duplicate the given `fd` with new
/// flags (e.g. to turn an `O_PATH` file descriptor into one that can be used for I/O).
pub fn reopen_fd_through_proc(
    fd: &impl AsRawFd,
    flags: libc::c_int,
    proc_self_fd: &File,
) -> io::Result {
    // Clear the `O_NOFOLLOW` flag if it is set since we need to follow the `/proc/self/fd` symlink
    // to get the file.
    openat(
        proc_self_fd,
        format!("{}", fd.as_raw_fd()).as_str(),
        flags & !libc::O_NOFOLLOW,
    )
}

/// Returns true if it's safe to open this inode without O_PATH.
pub fn is_safe_inode(mode: u32) -> bool {
    // Only regular files and directories are considered safe to be opened from the file
    // server without O_PATH.
    matches!(mode & libc::S_IFMT, libc::S_IFREG | libc::S_IFDIR)
}

/// Shorthand for an `io::Error` carrying `EBADF` (bad file descriptor).
pub fn ebadf() -> io::Error {
    io::Error::from_raw_os_error(libc::EBADF)
}

/// Shorthand for an `io::Error` carrying `EINVAL` (invalid argument).
pub fn einval() -> io::Error {
    io::Error::from_raw_os_error(libc::EINVAL)
}
virtiofsd-1.10.0/src/passthrough/xattrmap.rs000064400000000000000000000677571046102023000173000ustar 00000000000000//! The `xattrmap` module is used to translate extended attribute operations
//! between the server (virtiofsd) and the client (the virtio-fs guest kernel
//! module).
//!
//! Here's a non-exhaustive list of use-cases in which it may be beneficial to
//! install an extended attribute mapping:
//!
//! * The guest VM process is executing at a privilege level where it can't
//!   actually modify the extended attribute on the host. In this case, one
//!   may choose to map those guest's extended attributes to a "user."
//!   namespace on the host.
//!
//! * An extended attribute mapping can partition a host's extended attributes
//!   from a guest's to prevent the guest from clobbering extended attributes
//!   that the host has set and depends on.
//!
//! ## Rules
//!
//! The entity that launches virtiofsd may provide an "extended attributes
//! mapping" (or "xattrmap") that defines how extended attributes should be
//! translated. An xattrmap is really just a series of rules with a specific
//! syntax. When translating an xattr, the xattrmap rules are traversed in the
//! order that the mappings were originally written. The traversal is terminated
//! on the first rule that matches the xattr.
//!
//! The xattrmap _must_ have a terminating rule.
//!
//! ### Reference
//!
//! There are two ways of expressing an xattrmap rule:
//!
//! 1. `:type:scope:key:prepend:`
//! 2. `:map:key:prepend:` - this is just syntactic sugar for expressing a common
//!    rule. It is equivalent to `:prefix:all:key:prepend`.
//!
//! An xattrmap is just a series of these rules separated by whitespace. Each rule
//! can have its own delimiter. The colon (`:`) was just used here as an arbitrary
//! example. Use a delimiter that you find readable.
//!
//! Let's dissect the xattrmap rule syntax: `:type:scope:key:prepend:`.
//!
//! | type | description |
//! | - | - |
//! | prefix | The value of `key` is prepended to xattrs originating from the client (i.e., `{get,set,remove}xattr()`). The value of `prepend` is stripped from the server's reply to `listxattr()`. |
//! | ok | If the xattr originating from the client is prefixed with `key`, or if an xattr in a server reply is prefixed with `prepend` it passes through unchanged. |
//! | bad | If the xattr originating from the client is prefixed with `key` it is denied with `EPERM`. If the xattr in a server reply is prefixed with `prepend` it is hidden from the client and not included in the reply. |
//! | unsupported | If a client tries to use a name matching 'key' it's denied using ENOTSUP; when the server passes an attribute name matching 'prepend' it's hidden.
//! In many ways its use is very like 'ok' as either an explicit terminator or for special handling of certain patterns. |
//!
//! `ok` and `bad` can both be used as simple terminators for an xattrmap to
//! satisfy the expectation that every xattrmap has a terminator. For example,
//! `:ok:all:::`, will vacuously terminate all mappings. Placing a rule like
//! this at the end of the xattrmap rules is a common way of providing a
//! terminator.
//!
//! | scope | description |
//! | - | - |
//! | server | Match on xattrnames in the server reply that are prefixed with `prepend`. |
//! | client | Match on xattrnames from the client that are prefixed with `key`. |
//! | all | Matches on both server replies and client requests as described for `server` and `client` scopes. |
//!
//! ### Examples
//!
//! These have been taken almost verbatim from the original virtiofsd
//! documentation in the QEMU source code.
//!
//! #### Example 1
//!
//! ```text
//! :prefix:all::user.virtiofs.:
//! :bad:all:::
//! ```
//!
//! There are two rules in this xattrmap. The first rule prefixes and strips
//! `user.virtiofs.` from client requests and server replies respectively.
//!
//! The second rule hides any non-prefixed extended attributes that the host
//! set.
//!
//! #### Example 2
//!
//! ```text
//! /prefix/all/trusted./user.virtiofs./
//! /bad/server//trusted./
//! /bad/client/user.virtiofs.//
//! /ok/all///
//! ```
//!
//! The first rule prefixes client xattrnames with `trusted.` and strips
//! `user.virtiofs.` from xattrnames included in the server reply.
//!
//! The second rule hides unprefixed `trusted.` attributes on the host.
//!
//! The third rule prevents the guest from manipulating the `user.virtiofs.`
//! namespace directly.
//!
//! The final rule is the terminator and allows all remaining attributes
//! through unchanged.
#![deny(missing_docs)] use std::borrow::Cow; use std::convert::TryFrom; use std::ffi::{CStr, CString}; use std::fmt; use std::iter::Peekable; /// Expected error conditions with respect to parsing an XattrMap or /// attempting to match on a rule. #[derive(Debug, Eq, PartialEq)] pub enum ErrorKind { /// Scope is not one of: "all", "server", "client". InvalidScope { /// The unexpected value parsed from the input stream. got: String, /// A list of the expected values. expected: String, }, /// Type is not one of "prefix", "ok", "bad", or "map". InvalidType { /// The unexpected value parsed from the input stream. got: String, /// A list of the expected values. expected: String, }, /// A delimiter has been found that does not match the delimiter /// the rule started with. InvalidDelimiter, /// The rule is missing fields. IncompleteRule, /// There may only be one `map` rule and it must be the final /// rule; if this error is returned, then multiple map rules /// exist or one exists and it is not the final rule. MapRuleViolation, /// The input stream doesn't contain any rules. NoRulesProvided, /// None of the rules matched on the input. UnterminatedMapping, } impl std::error::Error for ErrorKind {} impl fmt::Display for ErrorKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{self:?}") } } /// Errors specific to XattrMap operations. #[derive(Debug, Eq, PartialEq)] pub struct Error { /// The specific error condition that was detected. pub cause: ErrorKind, /// The culpable rule, if any. pub rule: Option, } impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { Some(&self.cause) } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{self:?}") } } impl From for Error { fn from(ek: ErrorKind) -> Self { Self { cause: ek, rule: None, } } } bitflags::bitflags! 
{
    struct Scope: u8 {
        /// Rule applies to xattr names arriving from the guest.
        const CLIENT = 0b01;
        /// Rule applies to xattr names in replies going back to the guest.
        const SERVER = 0b10;
    }
}

impl Scope {
    /// Parses a scope field ("all", "client" or "server") into a `Scope`
    /// bitmask; "all" selects both directions.
    fn from_bytes>(bytes: B) -> Result {
        let bytes = bytes.as_ref();
        Ok(match bytes {
            b"all" => Scope::CLIENT | Scope::SERVER,
            b"client" => Scope::CLIENT,
            b"server" => Scope::SERVER,
            _ => {
                return Err(ErrorKind::InvalidScope {
                    got: String::from_utf8_lossy(bytes).into(),
                    expected: ["all", "client", "server"].join(", "),
                })
            }
        })
    }
}

/// The action a rule performs when it matches (see the module docs table).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum Type {
    Prefix,
    Okay,
    Bad,
    Unsupported,
    Map,
}

impl Type {
    /// Parses a type field ("prefix", "ok", "bad", "unsupported" or "map")
    /// into a `Type`.
    fn from_bytes>(bytes: B) -> Result {
        let bytes = bytes.as_ref();
        Ok(match bytes {
            b"prefix" => Type::Prefix,
            b"ok" => Type::Okay,
            b"bad" => Type::Bad,
            b"unsupported" => Type::Unsupported,
            b"map" => Type::Map,
            _ => {
                return Err(ErrorKind::InvalidType {
                    got: String::from_utf8_lossy(bytes).into(),
                    expected: ["prefix", "ok", "bad", "map"].join(", "),
                })
            }
        })
    }
}

/// A single parsed xattrmap rule: `:type:scope:key:prepend:`.
#[derive(Clone, Debug, Eq, PartialEq)]
struct Rule {
    scope: Scope,
    type_: Type,
    // `key` is matched against client-originated names; `prepend` against
    // names in server replies.
    key: CString,
    prepend: CString,
}

impl Rule {
    /// Returns true if this rule applies to `xattr_name` for the given
    /// single-direction `scope` (must be exactly CLIENT or SERVER).
    fn matches(&self, scope: Scope, xattr_name: &[u8]) -> bool {
        if !self.scope.contains(scope) {
            return false;
        }
        match scope {
            Scope::CLIENT => xattr_name.starts_with(self.key.to_bytes()),
            Scope::SERVER => xattr_name.starts_with(self.prepend.to_bytes()),
            // Callers must pass a single direction, never CLIENT | SERVER.
            _ => panic!("ambiguous scope"),
        }
    }

    /// Consumes one complete rule from the character stream. The first
    /// character read becomes the delimiter for the rest of the rule.
    fn from_tokens(tokens: &mut Peekable) -> Result
    where
        I: Iterator,
    {
        // The caller has already trimmed the whitespace leading up to here,
        // so the next element should be a rule delimiter.
        let delim = tokens.next().ok_or(ErrorKind::InvalidDelimiter)?;

        // This exists instead of using take_while() because take_while() will
        // consume the delimiter (if it exists) and it won't complain if it doesn't
        // exist. That means that we wouldn't be able to check for an unterminated
        // rule error like this:
        // :prefix:all:trusted.:user.vm.
        // ^ missing ':'
        let mut next_token = || {
            let mut bytes = vec![];
            loop {
                if let Some(ch) = tokens.peek() {
                    if !ch.eq(&delim) {
                        // NOTE(review): `*ch as u8` truncates chars above
                        // U+00FF; assumes xattrmap fields are ASCII — confirm.
                        bytes.push(*ch as u8);
                        let _ = tokens.next();
                    } else {
                        // advance past delimiter
                        let _ = tokens.next();
                        break;
                    }
                } else {
                    // Ran out of tokens without finding a terminating delimiter
                    return Err(ErrorKind::IncompleteRule);
                }
            }
            Ok(bytes)
        };

        let type_ = Type::from_bytes(next_token()?)?;
        Ok(match type_ {
            // `map` rules have no scope field; they implicitly apply to both
            // directions.
            Type::Map => Rule {
                type_,
                scope: Scope::CLIENT | Scope::SERVER,
                key: CString::new(next_token()?).unwrap(),
                prepend: CString::new(next_token()?).unwrap(),
            },
            Type::Prefix | Type::Okay | Type::Bad | Type::Unsupported => {
                let scope = Scope::from_bytes(next_token()?)?;
                Rule {
                    type_,
                    scope,
                    key: CString::new(next_token()?).unwrap(),
                    prepend: CString::new(next_token()?).unwrap(),
                }
            }
        })
    }

    /// Desugars a `map` rule into the equivalent explicit rule sequence
    /// (prefix + optional bad + terminator), per the module docs.
    fn expand_map_type(rule: Self) -> Vec {
        assert_eq!(rule.type_, Type::Map);

        // 1st: Prefix matches/everything
        let mut rules = vec![Rule {
            type_: Type::Prefix,
            scope: Scope::CLIENT | Scope::SERVER,
            key: rule.key.clone(),
            prepend: rule.prepend.clone(),
        }];

        let last_rule_type = if !rule.key.as_bytes().is_empty() {
            // 2nd: Hide non-prefixed but matching entries on the host, and
            // stop the client accessing prefixed attributes directly
            rules.push(Rule {
                type_: Type::Bad,
                scope: Scope::CLIENT | Scope::SERVER,
                key: rule.prepend,
                prepend: rule.key,
            });
            Type::Okay
        } else {
            Type::Bad
        };

        // Last: Everything else
        rules.push(Rule {
            type_: last_rule_type,
            scope: Scope::CLIENT | Scope::SERVER,
            key: CString::new("").unwrap(),
            prepend: CString::new("").unwrap(),
        });
        rules
    }
}

/// A return value that indicates the xattr name input has passed through
/// the XattrMap where a rule was successfully matched and applied to the
/// xattrname.
#[derive(Debug, Eq, PartialEq)]
pub enum AppliedRule<'a> {
    /// Server should pass the interior value onward through to the requested operation.
    Pass(Cow<'a, CStr>),
    /// Server should return EPERM (i.e., this matched on a `bad` rule).
    Deny,
    /// Server should return ENOTSUP (i.e., this matched on a `unsupported` rule).
    Unsupported,
}

/// A collection of well-formed xattr translation rules.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct XattrMap {
    // Rules in declaration order; the first match wins.
    rules: Vec,
}

impl XattrMap {
    /// Applies xattrmap rules to a single extended attribute name.
    ///
    /// This should be called *before* any other extended attribute
    /// operation is performed on the host file system.
    ///
    /// Client request -> this method -> {get,set,remove}xattr() -> server response
    ///
    /// See also: getxattr(2), setxattr(2), removexattr(2)
    pub fn map_client_xattr<'a>(&self, xattr_name: &'a CStr) -> Result, Error> {
        let rule = self.find_rule(Scope::CLIENT, xattr_name.to_bytes())?;
        Ok(match rule.type_ {
            // Pass through unchanged; borrow to avoid an allocation.
            Type::Okay => AppliedRule::Pass(Cow::Borrowed(xattr_name)),
            Type::Bad => AppliedRule::Deny,
            Type::Unsupported => AppliedRule::Unsupported,
            Type::Prefix => {
                // Build `prepend + xattr_name` as the host-side name.
                let mut concat = rule.prepend.as_bytes().to_vec();
                concat.extend_from_slice(xattr_name.to_bytes());
                AppliedRule::Pass(Cow::Owned(CString::new(concat).unwrap()))
            }
            // `map` rules are expanded at parse time (see expand_map_type).
            Type::Map => panic!("Unexpanded MAP rule was found."),
        })
    }

    /// Applies xattrmap rules to a list of extended attribute names.
    ///
    /// This should be called *before* replying to the client with the list
    /// of extended attribute names.
    ///
    /// Client request -> listxattr() -> this method -> server response
    ///
    /// See also: listxattr(2)
    pub fn map_server_xattrlist(&self, xattr_names: Vec) -> Result, Error> {
        let mut filtered = Vec::with_capacity(xattr_names.len());
        // The input is a NUL-separated name list, as listxattr(2) returns it.
        let unprocessed = xattr_names.split(|b| *b == 0).filter(|bs| !bs.is_empty());
        for xattr_name in unprocessed {
            let rule = self.find_rule(Scope::SERVER, xattr_name)?;
            let processed = match rule.type_ {
                Type::Bad | Type::Unsupported => continue, // hide this from the client
                Type::Okay => xattr_name,
                Type::Prefix => &xattr_name[rule.prepend.as_bytes().len()..], // strip prefix
                Type::Map => panic!("Unexpanded MAP rule was found."),
            };
            filtered.extend_from_slice(processed);
            filtered.push(0);
        }
        // An empty reply is still a single NUL terminator.
        if filtered.is_empty() {
            filtered.push(0);
        }
        filtered.shrink_to_fit();
        Ok(filtered)
    }

    /// Returns the first rule matching `xattr_name` in the given direction,
    /// or `UnterminatedMapping` if none matched.
    fn find_rule(&self, scope: Scope, xattr_name: &[u8]) -> Result<&Rule, Error> {
        let rule = self
            .rules
            .iter()
            .find(|r| r.matches(scope, xattr_name))
            .ok_or(ErrorKind::UnterminatedMapping)
            .map_err(|e| Error {
                cause: e,
                rule: None,
            })?;
        Ok(rule)
    }
}

impl TryFrom<&str> for XattrMap {
    type Error = Error;

    // Parses a whitespace-separated sequence of rules; rule numbers in errors
    // are 1-based.
    fn try_from(input: &str) -> Result {
        let trimmed = input.trim();
        let mut unparsed = trimmed.chars().peekable();
        let mut rules: Vec = vec![];
        while unparsed.peek().is_some() {
            // Skip any whitespace between rules
            if let Some(ch) = unparsed.peek() {
                if ch.is_ascii_whitespace() {
                    let _ = unparsed.next();
                    continue;
                }
            }
            let rule = Rule::from_tokens(&mut unparsed).map_err(|e| Error {
                cause: e,
                rule: Some(rules.len() + 1),
            })?;
            if rule.type_ == Type::Map {
                // There may only be one 'map' rule and it must be the final rule
                if unparsed.peek().is_some() {
                    return Err(Error {
                        rule: Some(rules.len() + 1),
                        cause: ErrorKind::MapRuleViolation,
                    });
                }
                rules.append(&mut Rule::expand_map_type(rule));
            } else {
                rules.push(rule);
            };
        }
        if rules.is_empty() {
            return Err(ErrorKind::NoRulesProvided.into());
        }
        Ok(Self { rules })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn
test_parser_can_parse_single_rule() { let input = ":prefix:client:trusted.:user.virtiofs.:"; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![Rule { type_: Type::Prefix, scope: Scope::CLIENT, key: CString::new("trusted.").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }], }; assert_eq!(actual, expected); } #[test] fn test_parser_can_parse_multiple_valid_rules() { let input = ":prefix:all::user.virtiofs.::bad:all:::"; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![ Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; assert_eq!(actual, expected); } #[test] fn test_parser_can_parse_rules_separated_by_whitespace() { let input = r#" /prefix/all/trusted./user.virtiofs./ /bad/server//trusted./ /bad/client/user.virtiofs.// /ok/all/// "#; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![ Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("trusted.").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("trusted.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::CLIENT, key: CString::new("user.virtiofs.").unwrap(), prepend: CString::new("").unwrap(), }, Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; assert_eq!(actual, expected); } #[test] fn test_parser_emits_incomplete_rule_error() { let input = ":prefix:client:hi"; let actual = XattrMap::try_from(input).unwrap_err(); let expected = Error { rule: Some(1), cause: ErrorKind::IncompleteRule, }; assert_eq!(actual, expected); } 
#[test] fn test_parser_emits_error_when_multiple_map_rules_exist() { let input = ":map:trusted.:virtiofs.user.::map:trusted.:virtiofs.user.:"; let actual = XattrMap::try_from(input).unwrap_err(); let expected = Error { rule: Some(1), cause: ErrorKind::MapRuleViolation, }; assert_eq!(actual, expected); } #[test] fn test_parser_expands_map_rule_with_empty_key() { let input = ":map::user.virtiofs.:"; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![ Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; assert_eq!(actual, expected); } #[test] fn test_parser_expands_map_rule_with_key_and_prepend() { let input = ":map:trusted.:user.virtiofs.:"; let actual = XattrMap::try_from(input).unwrap(); let expected = XattrMap { rules: vec![ Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("trusted.").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }, Rule { type_: Type::Bad, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("user.virtiofs.").unwrap(), prepend: CString::new("trusted.").unwrap(), }, Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; assert_eq!(actual, expected); } #[test] fn test_parser_emits_error_when_invalid_type_is_used() { let input = ":TOMATOPIRATE:trusted.:virtiofs.user.:"; assert!(XattrMap::try_from(input).is_err()); } #[test] fn test_parser_emits_error_when_invalid_scope_is_used() { let input = "/prefix/helloworld///"; assert!(XattrMap::try_from(input).is_err()); } #[test] fn test_parser_emits_error_when_no_rules_are_provided() { let input = " "; let actual = XattrMap::try_from(input).unwrap_err(); let expected = Error { rule: None, cause: 
ErrorKind::NoRulesProvided, }; assert_eq!(actual, expected); } #[test] fn test_parser_can_parse_rules_with_different_delimiters() { let input = ":prefix:all:trusted.:user.virtiofs.: /prefix/all/trusted./user.virtiofs./"; let expected_rule = Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("trusted.").unwrap(), prepend: CString::new("user.virtiofs.").unwrap(), }; let expected = XattrMap { rules: vec![expected_rule.clone(), expected_rule], }; let actual = XattrMap::try_from(input).unwrap(); assert_eq!(actual, expected); } #[test] fn test_rule_ok_all() { let map = XattrMap { rules: vec![Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }], }; let input = CString::new("user.virtiofs.potato").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Pass(CString::new("user.virtiofs.potato").unwrap().into()); assert_eq!(actual, expected); } #[test] fn test_rule_bad_hides_xattr_names_from_client() { let input = b"security.secret\x00boring_attr".to_vec(); let map = XattrMap { rules: vec![ Rule { type_: Type::Bad, scope: Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("security.").unwrap(), }, Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; let actual = map.map_server_xattrlist(input).unwrap(); let expected = b"boring_attr\x00"; assert_eq!(actual.as_slice(), expected); } #[test] fn test_rule_unsupported_hides_xattr_names_from_client() { let input = b"security.secret\x00boring_attr".to_vec(); let map = XattrMap { rules: vec![ Rule { type_: Type::Unsupported, scope: Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("security.").unwrap(), }, Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }, ], }; let actual = 
map.map_server_xattrlist(input).unwrap(); let expected = b"boring_attr\x00"; assert_eq!(actual.as_slice(), expected); } #[test] fn test_rule_bad_denies_the_client_request() { let map = XattrMap { rules: vec![Rule { type_: Type::Bad, scope: Scope::CLIENT, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }], }; let input = CString::new("virtiofs.").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Deny; assert_eq!(actual, expected); } #[test] fn test_rule_unsupported_not_support_the_client_request() { let map = XattrMap { rules: vec![Rule { type_: Type::Unsupported, scope: Scope::CLIENT, key: CString::new("").unwrap(), prepend: CString::new("").unwrap(), }], }; let input = CString::new("virtiofs.").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Unsupported; assert_eq!(actual, expected); } #[test] fn test_rule_prefix_prepends_xattr_names_from_client() { let map = XattrMap { rules: vec![Rule { type_: Type::Prefix, scope: Scope::CLIENT | Scope::SERVER, key: CString::new("trusted.").unwrap(), prepend: CString::new("virtiofs.user.").unwrap(), }], }; let input = CString::new("trusted.secret_thing").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Pass(Cow::Owned( CString::new("virtiofs.user.trusted.secret_thing").unwrap(), )); assert_eq!(actual, expected); } #[test] fn test_rule_prefix_strips_prefixes_from_server() { let map = XattrMap { rules: vec![Rule { type_: Type::Prefix, scope: Scope::SERVER, key: CString::new("").unwrap(), prepend: CString::new("virtiofs.user.").unwrap(), }], }; let list = b"virtiofs.user.x".to_vec(); let actual = map.map_server_xattrlist(list).unwrap(); let expected = b"x\x00".to_vec(); assert_eq!(actual, expected); } #[test] fn test_rule_ok_allows_xattr_names_to_pass_through_unchanged() { let map = XattrMap { rules: vec![Rule { type_: Type::Okay, scope: Scope::CLIENT | Scope::SERVER, key: 
CString::new("allow.").unwrap(), prepend: CString::new("allow.").unwrap(), }], }; let input = CString::new("allow.y").unwrap(); let actual = map.map_client_xattr(&input).unwrap(); let expected = AppliedRule::Pass(Cow::Owned(CString::new("allow.y").unwrap())); assert_eq!(actual, expected); let list = b"allow.y\x00".to_vec(); let expected = list.clone(); let actual = map.map_server_xattrlist(list).unwrap(); assert_eq!(actual, expected); } } virtiofsd-1.10.0/src/read_dir.rs000064400000000000000000000110051046102023000146170ustar 00000000000000// Copyright 2020 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use crate::filesystem::{DirEntry, DirectoryIterator}; use std::ffi::CStr; use std::io; use std::mem::size_of; use std::ops::{Deref, DerefMut}; use std::os::unix::io::AsRawFd; use vm_memory::ByteValued; #[repr(C, packed)] #[derive(Default, Clone, Copy)] struct LinuxDirent64 { d_ino: libc::ino64_t, d_off: libc::off64_t, d_reclen: libc::c_ushort, d_ty: libc::c_uchar, } unsafe impl ByteValued for LinuxDirent64 {} #[derive(Default)] pub struct ReadDir

{ buf: P, current: usize, end: usize, } impl> ReadDir

{ pub fn new(dir: &D, offset: libc::off64_t, mut buf: P) -> io::Result { // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::lseek64(dir.as_raw_fd(), offset, libc::SEEK_SET) }; if res < 0 { return Err(io::Error::last_os_error()); } // Safe because the kernel guarantees that it will only write to `buf` and we check the // return value. let res = unsafe { libc::syscall( libc::SYS_getdents64, dir.as_raw_fd(), buf.as_mut_ptr() as *mut LinuxDirent64, buf.len() as libc::c_int, ) }; if res < 0 { return Err(io::Error::last_os_error()); } Ok(ReadDir { buf, current: 0, end: res as usize, }) } } impl

ReadDir

{ /// Returns the number of bytes from the internal buffer that have not yet been consumed. pub fn remaining(&self) -> usize { self.end.saturating_sub(self.current) } } impl> DirectoryIterator for ReadDir

{ fn next(&mut self) -> Option { let rem = &self.buf[self.current..self.end]; if rem.is_empty() { return None; } // We only use debug asserts here because these values are coming from the kernel and we // trust them implicitly. debug_assert!( rem.len() >= size_of::(), "not enough space left in `rem`" ); let (front, back) = rem.split_at(size_of::()); let dirent64 = LinuxDirent64::from_slice(front).expect("unable to get LinuxDirent64 from slice"); let namelen = dirent64.d_reclen as usize - size_of::(); debug_assert!(namelen <= back.len(), "back is smaller than `namelen`"); // The kernel will pad the name with additional nul bytes until it is 8-byte aligned so // we need to strip those off here. let name = strip_padding(&back[..namelen]); let entry = DirEntry { ino: dirent64.d_ino, offset: dirent64.d_off as u64, type_: dirent64.d_ty as u32, name, }; debug_assert!( rem.len() >= dirent64.d_reclen as usize, "rem is smaller than `d_reclen`" ); self.current += dirent64.d_reclen as usize; Some(entry) } } // Like `CStr::from_bytes_with_nul` but strips any bytes after the first '\0'-byte. Panics if `b` // doesn't contain any '\0' bytes. fn strip_padding(b: &[u8]) -> &CStr { // It would be nice if we could use memchr here but that's locked behind an unstable gate. let pos = b .iter() .position(|&c| c == 0) .expect("`b` doesn't contain any nul bytes"); // Safe because we are creating this string with the first nul-byte we found so we can // guarantee that it is nul-terminated and doesn't contain any interior nuls. 
unsafe { CStr::from_bytes_with_nul_unchecked(&b[..=pos]) } } #[cfg(test)] mod test { use super::*; #[test] fn padded_cstrings() { assert_eq!(strip_padding(b".\0\0\0\0\0\0\0").to_bytes(), b"."); assert_eq!(strip_padding(b"..\0\0\0\0\0\0").to_bytes(), b".."); assert_eq!( strip_padding(b"normal cstring\0").to_bytes(), b"normal cstring" ); assert_eq!(strip_padding(b"\0\0\0\0").to_bytes(), b""); assert_eq!( strip_padding(b"interior\0nul bytes\0\0\0").to_bytes(), b"interior" ); } #[test] #[should_panic(expected = "`b` doesn't contain any nul bytes")] fn no_nul_byte() { strip_padding(b"no nul bytes in string"); } } virtiofsd-1.10.0/src/sandbox.rs000064400000000000000000000560701046102023000145170ustar 00000000000000// Copyright 2020 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use crate::{idmap, oslib, util}; use idmap::{GidMap, IdMapSetUpPipeMessage, UidMap}; use std::ffi::CString; use std::fs::{self, File}; use std::io::{Read, Write}; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::process::{self, Command}; use std::str::FromStr; use std::{error, fmt, io}; #[derive(Debug)] pub enum Error { /// Failed to bind mount `/proc/self/fd` into a temporary directory. BindMountProcSelfFd(io::Error), /// Failed to bind mount shared directory. BindMountSharedDir(io::Error), /// Failed to change to the old root directory. ChdirOldRoot(io::Error), /// Failed to change to the new root directory. ChdirNewRoot(io::Error), /// Call to libc::chroot returned an error. Chroot(io::Error), /// Failed to change to the root directory after the chroot call. ChrootChdir(io::Error), /// Failed to clean the properties of the mount point. CleanMount(io::Error), /// Failed to create a temporary directory. CreateTempDir(io::Error), /// Failed to drop supplemental groups. DropSupplementalGroups(io::Error), /// Call to libc::fork returned an error. 
Fork(io::Error), /// Failed to get the number of supplemental groups. GetSupplementalGroups(io::Error), /// Error bind-mounting a directory. MountBind(io::Error), /// Failed to mount old root. MountOldRoot(io::Error), /// Error mounting proc. MountProc(io::Error), /// Failed to mount new root. MountNewRoot(io::Error), /// Error mounting target directory. MountTarget(io::Error), /// Failed to open `/proc/self/mountinfo`. OpenMountinfo(io::Error), /// Failed to open new root. OpenNewRoot(io::Error), /// Failed to open old root. OpenOldRoot(io::Error), /// Failed to open `/proc/self`. OpenProcSelf(io::Error), /// Failed to open `/proc/self/fd`. OpenProcSelfFd(io::Error), /// Error switching root directory. PivotRoot(io::Error), /// Failed to remove temporary directory. RmdirTempDir(io::Error), /// Failed to lazily unmount old root. UmountOldRoot(io::Error), /// Failed to lazily unmount temporary directory. UmountTempDir(io::Error), /// Call to libc::unshare returned an error. Unshare(io::Error), /// Failed to execute `newgidmap(1)`. WriteGidMap(String), /// Failed to write to `/proc/self/setgroups`. WriteSetGroups(io::Error), /// Failed to execute `newuidmap(1)`. 
WriteUidMap(String), /// Sandbox mode unavailable for non-privileged users SandboxModeInvalidUID, /// Setting uid_map is only allowed inside a namespace for non-privileged users SandboxModeInvalidUidMap, /// Setting gid_map is only allowed inside a namespace for non-privileged users SandboxModeInvalidGidMap, } impl error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Error::{ SandboxModeInvalidGidMap, SandboxModeInvalidUID, SandboxModeInvalidUidMap, WriteGidMap, WriteUidMap, }; match self { SandboxModeInvalidUID => { write!( f, "sandbox mode 'chroot' can only be used by \ root (Use '--sandbox namespace' instead)" ) } SandboxModeInvalidUidMap => { write!( f, "uid_map can only be used by unprivileged user where sandbox mod is namespace \ (Use '--sandbox namespace' instead)" ) } SandboxModeInvalidGidMap => { write!( f, "gid_map can only be used by unprivileged user where sandbox mod is namespace \ (Use '--sandbox namespace' instead)" ) } WriteUidMap(msg) => write!(f, "write to uid map failed: {msg}"), WriteGidMap(msg) => write!(f, "write to gid map failed: {msg}"), _ => write!(f, "{self:?}"), } } } /// Mechanism to be used for setting up the sandbox. #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum SandboxMode { /// Create the sandbox using Linux namespaces. Namespace, /// Create the sandbox using chroot. Chroot, /// Don't attempt to isolate the process inside a sandbox. None, } impl FromStr for SandboxMode { type Err = &'static str; fn from_str(s: &str) -> Result { match s.to_lowercase().as_str() { "namespace" => Ok(SandboxMode::Namespace), "chroot" => Ok(SandboxMode::Chroot), "none" => Ok(SandboxMode::None), _ => Err("Unknown sandbox mode"), } } } /// A helper for creating a sandbox for isolating the service. pub struct Sandbox { /// The directory that is going to be shared with the VM. The sandbox will be constructed on top /// of this directory. 
    shared_dir: String,
    /// A `File` object for `/proc/self/fd` obtained from the sandboxed context.
    proc_self_fd: Option,
    /// A `File` object for `/proc/self/mountinfo` obtained from the sandboxed context.
    mountinfo_fd: Option,
    /// Mechanism to be used for setting up the sandbox.
    sandbox_mode: SandboxMode,
    /// UidMap to be used for `newuidmap(1)` command line arguments
    uid_map: Option,
    /// GidMap to be used for `newgidmap(1)` command line arguments
    gid_map: Option,
}

impl Sandbox {
    /// Creates a new `Sandbox` that will be rooted at `shared_dir`.
    ///
    /// `shared_dir` is canonicalized (via `fs::canonicalize`) before being
    /// stored, so subsequent sandbox setup operates on the resolved path.
    /// The isolated `proc_self_fd`/`mountinfo_fd` handles start out unset;
    /// they are populated later, during sandbox setup.
    ///
    /// # Errors
    ///
    /// Fails if canonicalization fails (e.g. the directory does not exist) or
    /// if the canonical path is not valid UTF-8 (reported as `EINVAL`).
    pub fn new(
        shared_dir: String,
        sandbox_mode: SandboxMode,
        uid_map: Option,
        gid_map: Option,
    ) -> io::Result {
        let shared_dir_rp = fs::canonicalize(shared_dir)?;
        let shared_dir_rp_str = shared_dir_rp
            .to_str()
            .ok_or_else(|| io::Error::from_raw_os_error(libc::EINVAL))?;
        Ok(Sandbox {
            shared_dir: shared_dir_rp_str.into(),
            proc_self_fd: None,
            mountinfo_fd: None,
            sandbox_mode,
            uid_map,
            gid_map,
        })
    }

    // Make `self.shared_dir` our root directory, and get isolated file descriptors for
    // `/proc/self/fd` and '/proc/self/mountinfo`.
    //
    // This is based on virtiofsd's setup_namespaces() and setup_mounts(), and it's very similar to
    // the strategy used in containers. Consists on a careful sequence of mounts and bind-mounts to
    // ensure it's not possible to escape the sandbox through `self.shared_dir` nor the file
    // descriptor obtained for `/proc/self/fd`.
    //
    // It's ugly, but it's the only way until Linux implements a proper containerization API.
    fn setup_mounts(&mut self) -> Result<(), Error> {
        // Open an FD to `/proc/self` so we can later open `/proc/self/mountinfo`.
        // (If we opened `/proc/self/mountinfo` now, it would appear empty by the end of this
        // function, which is why we need to defer opening it until then.)
let c_proc_self = CString::new("/proc/self").unwrap(); let proc_self_raw = unsafe { libc::open(c_proc_self.as_ptr(), libc::O_PATH) }; if proc_self_raw < 0 { return Err(Error::OpenProcSelf(std::io::Error::last_os_error())); } // Encapsulate the `/proc/self` FD in a `File` object so it is closed when this function // returns let proc_self = unsafe { File::from_raw_fd(proc_self_raw) }; // Ensure our mount changes don't affect the parent mount namespace. oslib::mount(None, "/", None, libc::MS_SLAVE | libc::MS_REC).map_err(Error::CleanMount)?; // Mount `/proc` in this context. oslib::mount( "proc".into(), "/proc", "proc".into(), libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID | libc::MS_RELATIME, ) .map_err(Error::MountProc)?; // Bind-mount `/proc/self/fd` onto /proc preventing access to ancestor // directories. oslib::mount("/proc/self/fd".into(), "/proc", None, libc::MS_BIND) .map_err(Error::BindMountProcSelfFd)?; // Obtain a file descriptor to /proc/self/fd/ by opening bind-mounted /proc directory. let c_proc_dir = CString::new("/proc").unwrap(); let proc_self_fd = unsafe { libc::open(c_proc_dir.as_ptr(), libc::O_PATH) }; if proc_self_fd < 0 { return Err(Error::OpenProcSelfFd(std::io::Error::last_os_error())); } // Safe because we just opened this fd. self.proc_self_fd = Some(unsafe { File::from_raw_fd(proc_self_fd) }); // Bind-mount `self.shared_dir` on itself so we can use as new root on `pivot_root` syscall. oslib::mount( self.shared_dir.as_str().into(), self.shared_dir.as_str(), None, libc::MS_BIND | libc::MS_REC, ) .map_err(Error::BindMountSharedDir)?; // Get a file descriptor to our old root so we can reference it after switching root. 
let c_root_dir = CString::new("/").unwrap(); let oldroot_fd = unsafe { libc::open( c_root_dir.as_ptr(), libc::O_DIRECTORY | libc::O_RDONLY | libc::O_CLOEXEC, ) }; if oldroot_fd < 0 { return Err(Error::OpenOldRoot(std::io::Error::last_os_error())); } // Get a file descriptor to the new root so we can reference it after switching root. let c_shared_dir = CString::new(self.shared_dir.clone()).unwrap(); let newroot_fd = unsafe { libc::open( c_shared_dir.as_ptr(), libc::O_DIRECTORY | libc::O_RDONLY | libc::O_CLOEXEC, ) }; if newroot_fd < 0 { return Err(Error::OpenNewRoot(std::io::Error::last_os_error())); } // Change to new root directory to prepare for `pivot_root` syscall. oslib::fchdir(newroot_fd).map_err(Error::ChdirNewRoot)?; // Call to `pivot_root` using `.` as both new and old root. let c_current_dir = CString::new(".").unwrap(); let ret = unsafe { libc::syscall( libc::SYS_pivot_root, c_current_dir.as_ptr(), c_current_dir.as_ptr(), ) }; if ret < 0 { return Err(Error::PivotRoot(std::io::Error::last_os_error())); } // Change to old root directory to prepare for cleaning up and unmounting it. oslib::fchdir(oldroot_fd).map_err(Error::ChdirOldRoot)?; // Clean up old root to avoid mount namespace propagation. oslib::mount(None, ".", None, libc::MS_SLAVE | libc::MS_REC).map_err(Error::CleanMount)?; // Lazily unmount old root. oslib::umount2(".", libc::MNT_DETACH).map_err(Error::UmountOldRoot)?; // Change to new root. oslib::fchdir(newroot_fd).map_err(Error::ChdirNewRoot)?; // We no longer need these file descriptors, so close them. unsafe { libc::close(newroot_fd) }; unsafe { libc::close(oldroot_fd) }; // Open `/proc/self/mountinfo` now let c_mountinfo = CString::new("mountinfo").unwrap(); let mountinfo_fd = unsafe { libc::openat(proc_self.as_raw_fd(), c_mountinfo.as_ptr(), libc::O_RDONLY) }; if mountinfo_fd < 0 { return Err(Error::OpenMountinfo(std::io::Error::last_os_error())); } // Safe because we just opened this fd. 
self.mountinfo_fd = Some(unsafe { File::from_raw_fd(mountinfo_fd) }); Ok(()) } /// Sets mappings for the given uid and gid. fn setup_id_mappings( &self, uid_map: Option<&UidMap>, gid_map: Option<&GidMap>, pid: i32, ) -> Result<(), Error> { // Unprivileged user can not set any mapping without any restriction. // Therefore, newuidmap/newgidmap is used instead of writing directly // into proc/[pid]/{uid,gid}_map. let mut newuidmap = Command::new("newuidmap"); newuidmap.arg(pid.to_string()); if let Some(uid_map) = uid_map { newuidmap.arg(uid_map.inside_uid.to_string()); newuidmap.arg(uid_map.outside_uid.to_string()); newuidmap.arg(uid_map.count.to_string()); } else { // Set up 1-to-1 mappings for our current uid. let current_uid = unsafe { libc::geteuid() }; newuidmap.arg(current_uid.to_string()); newuidmap.arg(current_uid.to_string()); newuidmap.arg("1"); } let mut output = newuidmap.output().map_err(|_| { Error::WriteUidMap(format!( "failed to execute newuidmap: {}", io::Error::last_os_error() )) })?; if !output.status.success() { return Err(Error::WriteUidMap( String::from_utf8_lossy(&output.stderr).to_string(), )); } let mut newgidmap = Command::new("newgidmap"); newgidmap.arg(pid.to_string()); if let Some(gid_map) = gid_map { newgidmap.arg(gid_map.inside_gid.to_string()); newgidmap.arg(gid_map.outside_gid.to_string()); newgidmap.arg(gid_map.count.to_string()); } else { // Set up 1-to-1 mappings for our current gid. 
let current_gid = unsafe { libc::getegid() }; newgidmap.arg(current_gid.to_string()); newgidmap.arg(current_gid.to_string()); newgidmap.arg("1"); } output = newgidmap.output().map_err(|_| { Error::WriteGidMap(format!( "failed to execute newgidmap: {}", io::Error::last_os_error() )) })?; if !output.status.success() { return Err(Error::WriteGidMap( String::from_utf8_lossy(&output.stderr).to_string(), )); } Ok(()) } pub fn enter_namespace(&mut self) -> Result<(), Error> { let uid = unsafe { libc::geteuid() }; let flags = if uid == 0 { libc::CLONE_NEWPID | libc::CLONE_NEWNS | libc::CLONE_NEWNET } else { // If running as an unprivileged user, rely on user_namespaces(7) for isolation. libc::CLONE_NEWPID | libc::CLONE_NEWNS | libc::CLONE_NEWNET | libc::CLONE_NEWUSER }; let (mut x_reader, mut x_writer) = oslib::pipe().unwrap(); let (mut y_reader, mut y_writer) = oslib::pipe().unwrap(); let pid = util::sfork().map_err(Error::Fork)?; let mut output = [0]; // First child is only responsible to setup id mapping // from outside of the main thread's namespace. // Pipe is used for synchronization between the main thread and the first child. // That will guarantee the mapping is done before the main thread gets running. if pid == 0 { // First child // Dropping the other end of the pipes drop(x_writer); drop(y_reader); // This is waiting until unshare() returns x_reader.read_exact(&mut output).unwrap(); assert_eq!(output[0], IdMapSetUpPipeMessage::Request as u8); // Setup uid/gid mappings if uid != 0 { let ppid = unsafe { libc::getppid() }; if let Err(error) = self.setup_id_mappings(self.uid_map.as_ref(), self.gid_map.as_ref(), ppid) { // We don't really need to close the pipes here, since the OS will close the FDs // after the process exits. But let's do it explicitly to signal an error to the // other end of the pipe. 
drop(x_reader); drop(y_writer); error!("sandbox: couldn't setup id mappings: {}", error); process::exit(1); }; } // Signal that mapping is done y_writer .write_all(&[IdMapSetUpPipeMessage::Done as u8]) .unwrap_or_else(|_| process::exit(1)); // Terminate this child process::exit(0); } else { // This is the parent let ret = unsafe { libc::unshare(flags) }; if ret != 0 { return Err(Error::Unshare(std::io::Error::last_os_error())); } // Dropping the other end of the pipes drop(x_reader); drop(y_writer); // Signal the first child to go ahead and setup the id mappings x_writer .write_all(&[IdMapSetUpPipeMessage::Request as u8]) .unwrap(); // Receive the signal that mapping is done. If the child process exits // before setting up the mapping, closing the pipe before sending the // message, `read_exact()` will fail with `UnexpectedEof`. y_reader .read_exact(&mut output) .unwrap_or_else(|_| process::exit(1)); assert_eq!(output[0], IdMapSetUpPipeMessage::Done as u8); let mut status = 0_i32; let _ = unsafe { libc::waitpid(pid, &mut status, 0) }; // Set the process inside the user namespace as root let mut ret = unsafe { libc::setresuid(0, 0, 0) }; if ret != 0 { warn!("Couldn't set the process uid as root: {}", ret); } ret = unsafe { libc::setresgid(0, 0, 0) }; if ret != 0 { warn!("Couldn't set the process gid as root: {}", ret); } let child = util::sfork().map_err(Error::Fork)?; if child == 0 { // Second child self.setup_mounts()?; Ok(()) } else { // This is the parent util::wait_for_child(child); // This never returns. } } } pub fn enter_chroot(&mut self) -> Result<(), Error> { let c_proc_self_fd = CString::new("/proc/self/fd").unwrap(); let proc_self_fd = unsafe { libc::open(c_proc_self_fd.as_ptr(), libc::O_PATH) }; if proc_self_fd < 0 { return Err(Error::OpenProcSelfFd(std::io::Error::last_os_error())); } // Safe because we just opened this fd. 
self.proc_self_fd = Some(unsafe { File::from_raw_fd(proc_self_fd) }); let c_mountinfo = CString::new("/proc/self/mountinfo").unwrap(); let mountinfo_fd = unsafe { libc::open(c_mountinfo.as_ptr(), libc::O_RDONLY) }; if mountinfo_fd < 0 { return Err(Error::OpenMountinfo(std::io::Error::last_os_error())); } // Safe because we just opened this fd. self.mountinfo_fd = Some(unsafe { File::from_raw_fd(mountinfo_fd) }); let c_shared_dir = CString::new(self.shared_dir.clone()).unwrap(); let ret = unsafe { libc::chroot(c_shared_dir.as_ptr()) }; if ret != 0 { return Err(Error::Chroot(std::io::Error::last_os_error())); } let c_root_dir = CString::new("/").unwrap(); let ret = unsafe { libc::chdir(c_root_dir.as_ptr()) }; if ret != 0 { return Err(Error::ChrootChdir(std::io::Error::last_os_error())); } Ok(()) } fn must_drop_supplemental_groups(&self) -> Result { let uid = unsafe { libc::geteuid() }; if uid != 0 { return Ok(false); } let uid_mmap_data = fs::read_to_string("/proc/self/uid_map").map_err(Error::DropSupplementalGroups)?; let uid_map: Vec<_> = uid_mmap_data.split_whitespace().collect(); let gid_map_data = fs::read_to_string("/proc/self/gid_map").map_err(Error::DropSupplementalGroups)?; let gid_map: Vec<_> = gid_map_data.split_whitespace().collect(); let setgroups = fs::read_to_string("/proc/self/setgroups").map_err(Error::DropSupplementalGroups)?; // A single line mapping only has 3 fields, and the 'count' field should // be 1. 
let single_uid_mapping = uid_map.len() == 3 && uid_map[2] == "1"; let single_gid_mapping = gid_map.len() == 3 && gid_map[2] == "1"; Ok(setgroups.trim() != "deny" || !single_uid_mapping || !single_gid_mapping) } fn drop_supplemental_groups(&self) -> Result<(), Error> { let ngroups = unsafe { libc::getgroups(0, std::ptr::null_mut()) }; if ngroups < 0 { return Err(Error::GetSupplementalGroups(std::io::Error::last_os_error())); } else if ngroups != 0 { let ret = unsafe { libc::setgroups(0, std::ptr::null()) }; if ret != 0 { return Err(Error::DropSupplementalGroups( std::io::Error::last_os_error(), )); } } Ok(()) } /// Set up sandbox, pub fn enter(&mut self) -> Result<(), Error> { let uid = unsafe { libc::geteuid() }; if uid != 0 && self.sandbox_mode == SandboxMode::Chroot { return Err(Error::SandboxModeInvalidUID); } if self.uid_map.is_some() && (uid == 0 || self.sandbox_mode != SandboxMode::Namespace) { return Err(Error::SandboxModeInvalidUidMap); } if self.gid_map.is_some() && (uid == 0 || self.sandbox_mode != SandboxMode::Namespace) { return Err(Error::SandboxModeInvalidGidMap); } // We must drop supplemental groups membership if we support switching // between arbitrary uids/gids, unless the following conditions are met: // we're not running as root or we are inside a user namespace with only // one uid and gid mapping and '/proc/self/setgroups' is equal to // "deny". In both of these cases, no arbitrary uid/gid switching is // possible and thus there's no need to drop supplemental groups. In // both of these scenarios calling setgroups() is also not allowed so we // avoid calling it since we know it will return a privilege error. if self.must_drop_supplemental_groups()? 
{ self.drop_supplemental_groups()?; } match self.sandbox_mode { SandboxMode::Namespace => self.enter_namespace(), SandboxMode::Chroot => self.enter_chroot(), SandboxMode::None => Ok(()), } } pub fn get_proc_self_fd(&mut self) -> Option { self.proc_self_fd.take() } pub fn get_mountinfo_fd(&mut self) -> Option { self.mountinfo_fd.take() } pub fn get_root_dir(&self) -> String { match self.sandbox_mode { SandboxMode::Namespace | SandboxMode::Chroot => "/".to_string(), SandboxMode::None => self.shared_dir.clone(), } } /// Return the prefix to strip from /proc/self/mountinfo entries to get paths that are actually /// accessible in our sandbox pub fn get_mountinfo_prefix(&self) -> Option { match self.sandbox_mode { SandboxMode::Namespace | SandboxMode::None => None, SandboxMode::Chroot => Some(self.shared_dir.clone()), } } } virtiofsd-1.10.0/src/seccomp.rs000064400000000000000000000171421046102023000145070ustar 00000000000000// Copyright 2020 Red Hat, Inc. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use libseccomp_sys::{ seccomp_init, seccomp_load, seccomp_release, seccomp_rule_add, SCMP_ACT_ALLOW, SCMP_ACT_KILL_PROCESS, SCMP_ACT_LOG, SCMP_ACT_TRAP, }; use std::convert::TryInto; use std::{error, fmt}; #[derive(Debug)] pub enum Error { /// Error allowing a syscall AllowSeccompSyscall(i32), /// Cannot load seccomp filter LoadSeccompFilter, /// Cannot initialize seccomp context InitSeccompContext, } impl error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "virtiofsd_seccomp_error: {self:?}") } } #[derive(Copy, Clone, Debug)] pub enum SeccompAction { Allow, Kill, Log, Trap, } impl From for u32 { fn from(action: SeccompAction) -> u32 { match action { SeccompAction::Allow => SCMP_ACT_ALLOW, SeccompAction::Kill => SCMP_ACT_KILL_PROCESS, SeccompAction::Log => SCMP_ACT_LOG, SeccompAction::Trap => SCMP_ACT_TRAP, } } } macro_rules! 
allow_syscall { ($ctx:ident, $syscall:expr) => { let syscall_nr: i32 = $syscall.try_into().unwrap(); let ret = unsafe { seccomp_rule_add($ctx, SCMP_ACT_ALLOW, syscall_nr, 0) }; if ret != 0 { return Err(Error::AllowSeccompSyscall(syscall_nr)); } }; } pub fn enable_seccomp(action: SeccompAction, allow_remote_logging: bool) -> Result<(), Error> { let ctx = unsafe { seccomp_init(action.into()) }; if ctx.is_null() { return Err(Error::InitSeccompContext); } allow_syscall!(ctx, libc::SYS_accept4); allow_syscall!(ctx, libc::SYS_brk); allow_syscall!(ctx, libc::SYS_capget); // For CAP_FSETID allow_syscall!(ctx, libc::SYS_capset); allow_syscall!(ctx, libc::SYS_clock_gettime); allow_syscall!(ctx, libc::SYS_clone); allow_syscall!(ctx, libc::SYS_clone3); allow_syscall!(ctx, libc::SYS_close); allow_syscall!(ctx, libc::SYS_copy_file_range); allow_syscall!(ctx, libc::SYS_dup); #[cfg(any( target_arch = "x86_64", target_arch = "s390x", target_arch = "powerpc64" ))] allow_syscall!(ctx, libc::SYS_epoll_create); allow_syscall!(ctx, libc::SYS_epoll_create1); allow_syscall!(ctx, libc::SYS_epoll_ctl); allow_syscall!(ctx, libc::SYS_epoll_pwait); #[cfg(any( target_arch = "x86_64", target_arch = "s390x", target_arch = "powerpc64" ))] allow_syscall!(ctx, libc::SYS_epoll_wait); allow_syscall!(ctx, libc::SYS_eventfd2); allow_syscall!(ctx, libc::SYS_exit); allow_syscall!(ctx, libc::SYS_exit_group); allow_syscall!(ctx, libc::SYS_fallocate); allow_syscall!(ctx, libc::SYS_fchdir); allow_syscall!(ctx, libc::SYS_fchmod); allow_syscall!(ctx, libc::SYS_fchmodat); allow_syscall!(ctx, libc::SYS_fchownat); allow_syscall!(ctx, libc::SYS_fcntl); allow_syscall!(ctx, libc::SYS_fdatasync); allow_syscall!(ctx, libc::SYS_fgetxattr); allow_syscall!(ctx, libc::SYS_flistxattr); allow_syscall!(ctx, libc::SYS_flock); allow_syscall!(ctx, libc::SYS_fremovexattr); allow_syscall!(ctx, libc::SYS_fsetxattr); #[cfg(not(target_arch = "loongarch64"))] allow_syscall!(ctx, libc::SYS_fstat); #[cfg(any(target_arch = "s390x", 
target_arch = "powerpc64"))] allow_syscall!(ctx, libc::SYS_fstatfs64); allow_syscall!(ctx, libc::SYS_fstatfs); allow_syscall!(ctx, libc::SYS_fsync); allow_syscall!(ctx, libc::SYS_ftruncate); allow_syscall!(ctx, libc::SYS_futex); #[cfg(any( target_arch = "x86_64", target_arch = "s390x", target_arch = "powerpc64" ))] allow_syscall!(ctx, libc::SYS_getdents); allow_syscall!(ctx, libc::SYS_getdents64); allow_syscall!(ctx, libc::SYS_getegid); allow_syscall!(ctx, libc::SYS_geteuid); allow_syscall!(ctx, libc::SYS_getpid); allow_syscall!(ctx, libc::SYS_getrandom); allow_syscall!(ctx, libc::SYS_gettid); allow_syscall!(ctx, libc::SYS_gettimeofday); allow_syscall!(ctx, libc::SYS_getxattr); allow_syscall!(ctx, libc::SYS_linkat); allow_syscall!(ctx, libc::SYS_listxattr); allow_syscall!(ctx, libc::SYS_lseek); allow_syscall!(ctx, libc::SYS_madvise); allow_syscall!(ctx, libc::SYS_membarrier); allow_syscall!(ctx, libc::SYS_mkdirat); allow_syscall!(ctx, libc::SYS_mknodat); allow_syscall!(ctx, libc::SYS_mmap); allow_syscall!(ctx, libc::SYS_mprotect); allow_syscall!(ctx, libc::SYS_mremap); allow_syscall!(ctx, libc::SYS_munmap); allow_syscall!(ctx, libc::SYS_name_to_handle_at); #[cfg(not(target_arch = "loongarch64"))] allow_syscall!(ctx, libc::SYS_newfstatat); #[cfg(target_arch = "powerpc64")] allow_syscall!(ctx, libc::SYS__llseek); #[cfg(any( target_arch = "x86_64", target_arch = "s390x", target_arch = "powerpc64" ))] allow_syscall!(ctx, libc::SYS_open); allow_syscall!(ctx, libc::SYS_openat); allow_syscall!(ctx, libc::SYS_openat2); allow_syscall!(ctx, libc::SYS_open_by_handle_at); allow_syscall!(ctx, libc::SYS_prctl); // TODO restrict to just PR_SET_NAME? 
allow_syscall!(ctx, libc::SYS_preadv); allow_syscall!(ctx, libc::SYS_pread64); allow_syscall!(ctx, libc::SYS_pwritev2); allow_syscall!(ctx, libc::SYS_pwrite64); allow_syscall!(ctx, libc::SYS_read); allow_syscall!(ctx, libc::SYS_readlinkat); allow_syscall!(ctx, libc::SYS_recvmsg); #[cfg(not(any(target_arch = "loongarch64", target_arch = "riscv64")))] allow_syscall!(ctx, libc::SYS_renameat); allow_syscall!(ctx, libc::SYS_renameat2); allow_syscall!(ctx, libc::SYS_removexattr); #[cfg(target_env = "gnu")] allow_syscall!(ctx, libc::SYS_rseq); allow_syscall!(ctx, libc::SYS_rt_sigaction); allow_syscall!(ctx, libc::SYS_rt_sigprocmask); allow_syscall!(ctx, libc::SYS_rt_sigreturn); allow_syscall!(ctx, libc::SYS_sched_getaffinity); // used by thread_pool allow_syscall!(ctx, libc::SYS_sched_yield); allow_syscall!(ctx, libc::SYS_sendmsg); allow_syscall!(ctx, libc::SYS_setgroups); allow_syscall!(ctx, libc::SYS_setresgid); allow_syscall!(ctx, libc::SYS_setresuid); //allow_syscall!(ctx, libc::SYS_setresgid32); Needed on some platforms, //allow_syscall!(ctx, libc::SYS_setresuid32); Needed on some platforms allow_syscall!(ctx, libc::SYS_set_robust_list); allow_syscall!(ctx, libc::SYS_setxattr); allow_syscall!(ctx, libc::SYS_sigaltstack); #[cfg(target_arch = "s390x")] allow_syscall!(ctx, libc::SYS_sigreturn); allow_syscall!(ctx, libc::SYS_statx); allow_syscall!(ctx, libc::SYS_symlinkat); allow_syscall!(ctx, libc::SYS_syncfs); #[cfg(target_arch = "x86_64")] allow_syscall!(ctx, libc::SYS_time); // Rarely needed, except on static builds allow_syscall!(ctx, libc::SYS_tgkill); allow_syscall!(ctx, libc::SYS_umask); #[cfg(any( target_arch = "x86_64", target_arch = "s390x", target_arch = "powerpc64" ))] allow_syscall!(ctx, libc::SYS_unlink); allow_syscall!(ctx, libc::SYS_unlinkat); allow_syscall!(ctx, libc::SYS_unshare); allow_syscall!(ctx, libc::SYS_utimensat); allow_syscall!(ctx, libc::SYS_write); allow_syscall!(ctx, libc::SYS_writev); if allow_remote_logging { allow_syscall!(ctx, 
libc::SYS_sendto); // Required by syslog } let ret = unsafe { seccomp_load(ctx) }; if ret != 0 { return Err(Error::LoadSeccompFilter); } unsafe { seccomp_release(ctx) }; Ok(()) } virtiofsd-1.10.0/src/server.rs000064400000000000000000001665701046102023000143760ustar 00000000000000// Copyright 2019 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use super::fs_cache_req_handler::FsCacheReqHandler; use crate::descriptor_utils::{Reader, Writer}; use crate::filesystem::{ Context, DirEntry, DirectoryIterator, Entry, Extensions, FileSystem, GetxattrReply, ListxattrReply, SecContext, ZeroCopyReader, ZeroCopyWriter, }; use crate::fuse::*; use crate::passthrough::util::einval; use crate::{oslib, Error, Result}; use std::convert::{TryFrom, TryInto}; use std::ffi::{CStr, CString}; use std::fs::File; use std::io::{self, Read, Write}; use std::mem::{size_of, MaybeUninit}; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Duration; use vm_memory::ByteValued; const FUSE_BUFFER_HEADER_SIZE: u32 = 0x1000; const MAX_BUFFER_SIZE: u32 = 1 << 20; const DIRENT_PADDING: [u8; 8] = [0; 8]; const CURRENT_DIR_CSTR: &[u8] = b"."; const PARENT_DIR_CSTR: &[u8] = b".."; struct ZcReader<'a>(Reader<'a>); impl<'a> ZeroCopyReader for ZcReader<'a> { fn read_to( &mut self, f: &File, count: usize, off: u64, flags: Option, ) -> io::Result { self.0.read_to_at(f, count, off, flags) } } impl<'a> io::Read for ZcReader<'a> { fn read(&mut self, buf: &mut [u8]) -> io::Result { self.0.read(buf) } } struct ZcWriter<'a>(Writer<'a>); impl<'a> ZeroCopyWriter for ZcWriter<'a> { fn write_from(&mut self, f: &File, count: usize, off: u64) -> io::Result { self.0.write_from_at(f, count, off) } } impl<'a> io::Write for ZcWriter<'a> { fn write(&mut self, buf: &[u8]) -> io::Result { self.0.write(buf) } fn flush(&mut self) -> io::Result<()> { self.0.flush() } } pub struct Server { fs: F, options: AtomicU64, } impl 
Server { pub fn new(fs: F) -> Server { Server { fs, options: AtomicU64::new(FsOptions::empty().bits()), } } #[allow(clippy::cognitive_complexity)] pub fn handle_message( &self, mut r: Reader, w: Writer, vu_req: Option<&mut T>, ) -> Result { let in_header: InHeader = r.read_obj().map_err(Error::DecodeMessage)?; if in_header.len > (MAX_BUFFER_SIZE + FUSE_BUFFER_HEADER_SIZE) { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } if let Ok(opcode) = Opcode::try_from(in_header.opcode) { debug!( "Received request: opcode={:?} ({}), inode={}, unique={}, pid={}", opcode, in_header.opcode, in_header.nodeid, in_header.unique, in_header.pid ); match opcode { Opcode::Lookup => self.lookup(in_header, r, w), Opcode::Forget => self.forget(in_header, r), // No reply. Opcode::Getattr => self.getattr(in_header, r, w), Opcode::Setattr => self.setattr(in_header, r, w), Opcode::Readlink => self.readlink(in_header, w), Opcode::Symlink => self.symlink(in_header, r, w), Opcode::Mknod => self.mknod(in_header, r, w), Opcode::Mkdir => self.mkdir(in_header, r, w), Opcode::Unlink => self.unlink(in_header, r, w), Opcode::Rmdir => self.rmdir(in_header, r, w), Opcode::Rename => self.rename(in_header, r, w), Opcode::Link => self.link(in_header, r, w), Opcode::Open => self.open(in_header, r, w), Opcode::Read => self.read(in_header, r, w), Opcode::Write => self.write(in_header, r, w), Opcode::Statfs => self.statfs(in_header, w), Opcode::Release => self.release(in_header, r, w), Opcode::Fsync => self.fsync(in_header, r, w), Opcode::Setxattr => self.setxattr(in_header, r, w), Opcode::Getxattr => self.getxattr(in_header, r, w), Opcode::Listxattr => self.listxattr(in_header, r, w), Opcode::Removexattr => self.removexattr(in_header, r, w), Opcode::Flush => self.flush(in_header, r, w), Opcode::Init => self.init(in_header, r, w), Opcode::Opendir => self.opendir(in_header, r, w), Opcode::Readdir => self.readdir(in_header, r, w), Opcode::Releasedir => 
self.releasedir(in_header, r, w), Opcode::Fsyncdir => self.fsyncdir(in_header, r, w), Opcode::Getlk => self.getlk(in_header, r, w), Opcode::Setlk => self.setlk(in_header, r, w), Opcode::Setlkw => self.setlkw(in_header, r, w), Opcode::Access => self.access(in_header, r, w), Opcode::Create => self.create(in_header, r, w), Opcode::Interrupt => Ok(self.interrupt(in_header)), Opcode::Bmap => self.bmap(in_header, r, w), Opcode::Destroy => Ok(self.destroy()), Opcode::Ioctl => self.ioctl(in_header, r, w), Opcode::Poll => self.poll(in_header, r, w), Opcode::NotifyReply => self.notify_reply(in_header, r, w), Opcode::BatchForget => self.batch_forget(in_header, r, w), Opcode::Fallocate => self.fallocate(in_header, r, w), Opcode::Readdirplus => self.readdirplus(in_header, r, w), Opcode::Rename2 => self.rename2(in_header, r, w), Opcode::Lseek => self.lseek(in_header, r, w), Opcode::CopyFileRange => self.copyfilerange(in_header, r, w), Opcode::SetupMapping => self.setupmapping(in_header, r, w, vu_req), Opcode::RemoveMapping => self.removemapping(in_header, r, w, vu_req), Opcode::Syncfs => self.syncfs(in_header, w), Opcode::TmpFile => self.tmpfile(in_header, r, w), } } else { debug!( "Received unknown request: opcode={}, inode={}", in_header.opcode, in_header.nodeid ); reply_error( io::Error::from_raw_os_error(libc::ENOSYS), in_header.unique, w, ) } } fn setupmapping( &self, in_header: InHeader, mut r: Reader, w: Writer, vu_req: Option<&mut T>, ) -> Result { if let Some(req) = vu_req { let SetupmappingIn { fh, foffset, len, flags, moffset, } = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.setupmapping( Context::from(in_header), in_header.nodeid.into(), fh.into(), foffset, len, flags, moffset, req, ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } else { reply_error( io::Error::from_raw_os_error(libc::EINVAL), in_header.unique, w, ) } } fn removemapping( &self, in_header: InHeader, mut r: Reader, w: Writer, 
vu_req: Option<&mut T>, ) -> Result { if let Some(req) = vu_req { let RemovemappingIn { count } = r.read_obj().map_err(Error::DecodeMessage)?; if let Some(size) = (count as usize).checked_mul(size_of::()) { if size > MAX_BUFFER_SIZE as usize { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } } else { return reply_error( io::Error::from_raw_os_error(libc::EOVERFLOW), in_header.unique, w, ); } let mut requests = Vec::with_capacity(count as usize); for _ in 0..count { requests.push( r.read_obj::() .map_err(Error::DecodeMessage)?, ); } match self .fs .removemapping(Context::from(in_header), requests, req) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } else { reply_error( io::Error::from_raw_os_error(libc::EINVAL), in_header.unique, w, ) } } fn lookup(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let namelen = (in_header.len as usize) .checked_sub(size_of::()) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0u8; namelen]; r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; let name = bytes_to_cstr(buf.as_ref())?; match self .fs .lookup(Context::from(in_header), in_header.nodeid.into(), name) { Ok(entry) => { let out = EntryOut::from(entry); reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn forget(&self, in_header: InHeader, mut r: Reader) -> Result { let ForgetIn { nlookup } = r.read_obj().map_err(Error::DecodeMessage)?; self.fs .forget(Context::from(in_header), in_header.nodeid.into(), nlookup); // There is no reply for forget messages. Ok(0) } fn getattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let GetattrIn { flags, fh, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; let handle = if (flags & GETATTR_FH) != 0 { Some(fh.into()) } else { None }; match self .fs .getattr(Context::from(in_header), in_header.nodeid.into(), handle) { Ok((st, timeout)) => { let out = AttrOut { attr_valid: timeout.as_secs(), attr_valid_nsec: timeout.subsec_nanos(), dummy: 0, attr: st.into(), }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn setattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let setattr_in: SetattrIn = r.read_obj().map_err(Error::DecodeMessage)?; let handle = if setattr_in.valid & FATTR_FH != 0 { Some(setattr_in.fh.into()) } else { None }; let valid = SetattrValid::from_bits_truncate(setattr_in.valid); let st: libc::stat64 = setattr_in.into(); match self.fs.setattr( Context::from(in_header), in_header.nodeid.into(), st, handle, valid, ) { Ok((st, timeout)) => { let out = AttrOut { attr_valid: timeout.as_secs(), attr_valid_nsec: timeout.subsec_nanos(), dummy: 0, attr: st.into(), }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn readlink(&self, in_header: InHeader, w: Writer) -> Result { match self .fs .readlink(Context::from(in_header), in_header.nodeid.into()) { Ok(linkname) => { // We need to disambiguate the option type here even though it is `None`. reply_ok(None::, Some(&linkname), in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn symlink(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { // Unfortunately the name and linkname are encoded one after another and // separated by a nul character. 
let len = (in_header.len as usize) .checked_sub(size_of::()) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0; len]; r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; let mut components = buf.split_inclusive(|c| *c == b'\0'); let name = components.next().ok_or(Error::MissingParameter)?; let linkname = components.next().ok_or(Error::MissingParameter)?; let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed)); let extensions = get_extensions(options, name.len() + linkname.len(), buf.as_slice())?; match self.fs.symlink( Context::from(in_header), bytes_to_cstr(linkname)?, in_header.nodeid.into(), bytes_to_cstr(name)?, extensions, ) { Ok(entry) => { let out = EntryOut::from(entry); reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn mknod(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let MknodIn { mode, rdev, umask, .. } = r.read_obj().map_err(Error::DecodeMessage)?; let remaining_len = (in_header.len as usize) .checked_sub(size_of::()) .and_then(|l| l.checked_sub(size_of::())) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0; remaining_len]; r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; let mut components = buf.split_inclusive(|c| *c == b'\0'); let name = components.next().ok_or(Error::MissingParameter)?; let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed)); let extensions = get_extensions(options, name.len(), buf.as_slice())?; match self.fs.mknod( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(name)?, mode, rdev, umask, extensions, ) { Ok(entry) => { let out = EntryOut::from(entry); reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn mkdir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let MkdirIn { mode, umask } = r.read_obj().map_err(Error::DecodeMessage)?; let remaining_len = (in_header.len as usize) 
.checked_sub(size_of::()) .and_then(|l| l.checked_sub(size_of::())) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0; remaining_len]; r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; let mut components = buf.split_inclusive(|c| *c == b'\0'); let name = components.next().ok_or(Error::MissingParameter)?; let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed)); let extensions = get_extensions(options, name.len(), buf.as_slice())?; match self.fs.mkdir( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(name)?, mode, umask, extensions, ) { Ok(entry) => { let out = EntryOut::from(entry); reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn unlink(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let namelen = (in_header.len as usize) .checked_sub(size_of::()) .ok_or(Error::InvalidHeaderLength)?; let mut name = vec![0; namelen]; r.read_exact(&mut name).map_err(Error::DecodeMessage)?; match self.fs.unlink( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(&name)?, ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn rmdir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let namelen = (in_header.len as usize) .checked_sub(size_of::()) .ok_or(Error::InvalidHeaderLength)?; let mut name = vec![0; namelen]; r.read_exact(&mut name).map_err(Error::DecodeMessage)?; match self.fs.rmdir( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(&name)?, ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn do_rename( &self, in_header: InHeader, msg_size: usize, newdir: u64, flags: u32, mut r: Reader, w: Writer, ) -> Result { let buflen = (in_header.len as usize) .checked_sub(size_of::()) .and_then(|l| l.checked_sub(msg_size)) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0; buflen]; 
r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; // We want to include the '\0' byte in the first slice. let split_pos = buf .iter() .position(|c| *c == b'\0') .map(|p| p + 1) .ok_or(Error::MissingParameter)?; let (oldname, newname) = buf.split_at(split_pos); match self.fs.rename( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(oldname)?, newdir.into(), bytes_to_cstr(newname)?, flags, ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn rename(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let RenameIn { newdir } = r.read_obj().map_err(Error::DecodeMessage)?; self.do_rename(in_header, size_of::(), newdir, 0, r, w) } fn rename2(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let Rename2In { newdir, flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?; let flags = flags & (libc::RENAME_EXCHANGE | libc::RENAME_NOREPLACE | libc::RENAME_WHITEOUT); self.do_rename(in_header, size_of::(), newdir, flags, r, w) } fn link(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let LinkIn { oldnodeid } = r.read_obj().map_err(Error::DecodeMessage)?; let namelen = (in_header.len as usize) .checked_sub(size_of::()) .and_then(|l| l.checked_sub(size_of::())) .ok_or(Error::InvalidHeaderLength)?; let mut name = vec![0; namelen]; r.read_exact(&mut name).map_err(Error::DecodeMessage)?; match self.fs.link( Context::from(in_header), oldnodeid.into(), in_header.nodeid.into(), bytes_to_cstr(&name)?, ) { Ok(entry) => { let out = EntryOut::from(entry); reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn open(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let OpenIn { flags, open_flags, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; let kill_priv = open_flags & OPEN_KILL_SUIDGID != 0; match self.fs.open( Context::from(in_header), in_header.nodeid.into(), kill_priv, flags, ) { Ok((handle, opts)) => { let out = OpenOut { fh: handle.map(Into::into).unwrap_or(0), open_flags: opts.bits(), ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn read(&self, in_header: InHeader, mut r: Reader, mut w: Writer) -> Result { let ReadIn { fh, offset, size, read_flags, lock_owner, flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?; let owner = if read_flags & READ_LOCKOWNER != 0 { Some(lock_owner) } else { None }; // Split the writer into 2 pieces: one for the `OutHeader` and the rest for the data. let data_writer = ZcWriter(w.split_at(size_of::()).unwrap()); match self.fs.read( Context::from(in_header), in_header.nodeid.into(), fh.into(), data_writer, size, offset, owner, flags, ) { Ok(count) => { // Don't use `reply_ok` because we need to set a custom size length for the // header. let out = OutHeader { len: (size_of::() + count) as u32, error: 0, unique: in_header.unique, }; debug!("Replying OK, header: {:?}", out); w.write_all(out.as_slice()).map_err(Error::EncodeMessage)?; Ok(out.len as usize) } Err(e) => reply_error(e, in_header.unique, w), } } fn write(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let WriteIn { fh, offset, size, write_flags, lock_owner, flags, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; let owner = if write_flags & WRITE_LOCKOWNER != 0 { Some(lock_owner) } else { None }; let delayed_write = write_flags & WRITE_CACHE != 0; let kill_priv = write_flags & WRITE_KILL_PRIV != 0; let data_reader = ZcReader(r); match self.fs.write( Context::from(in_header), in_header.nodeid.into(), fh.into(), data_reader, size, offset, owner, delayed_write, kill_priv, flags, ) { Ok(count) => { let out = WriteOut { size: count as u32, ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn statfs(&self, in_header: InHeader, w: Writer) -> Result { match self .fs .statfs(Context::from(in_header), in_header.nodeid.into()) { Ok(st) => reply_ok(Some(Kstatfs::from(st)), None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn release(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let ReleaseIn { fh, flags, release_flags, lock_owner, } = r.read_obj().map_err(Error::DecodeMessage)?; let flush = release_flags & RELEASE_FLUSH != 0; let flock_release = release_flags & RELEASE_FLOCK_UNLOCK != 0; let lock_owner = if flush || flock_release { Some(lock_owner) } else { None }; match self.fs.release( Context::from(in_header), in_header.nodeid.into(), flags, fh.into(), flush, flock_release, lock_owner, ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn fsync(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let FsyncIn { fh, fsync_flags, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; let datasync = fsync_flags & 0x1 != 0; match self.fs.fsync( Context::from(in_header), in_header.nodeid.into(), datasync, fh.into(), ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn setxattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed)); let ( SetxattrIn { size, flags, setxattr_flags, .. }, setxattrin_size, ) = if options.contains(FsOptions::SETXATTR_EXT) { ( r.read_obj().map_err(Error::DecodeMessage)?, size_of::(), ) } else { let SetxattrInCompat { size, flags } = r.read_obj().map_err(Error::DecodeMessage)?; ( SetxattrIn { size, flags, setxattr_flags: 0, padding: 0, }, size_of::(), ) }; // The name and value and encoded one after another and separated by a '\0' character. let len = (in_header.len as usize) .checked_sub(size_of::()) .and_then(|l| l.checked_sub(setxattrin_size)) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0; len]; r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; // We want to include the '\0' byte in the first slice. let split_pos = buf .iter() .position(|c| *c == b'\0') .map(|p| p + 1) .ok_or(Error::MissingParameter)?; let (name, value) = buf.split_at(split_pos); if size != value.len() as u32 { return Err(Error::InvalidXattrSize((size, value.len()))); } match self.fs.setxattr( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(name)?, value, flags, SetxattrFlags::from_bits_truncate(setxattr_flags), ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn getxattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let GetxattrIn { size, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; let namelen = (in_header.len as usize) .checked_sub(size_of::()) .and_then(|l| l.checked_sub(size_of::())) .ok_or(Error::InvalidHeaderLength)?; let mut name = vec![0; namelen]; r.read_exact(&mut name).map_err(Error::DecodeMessage)?; if size > MAX_BUFFER_SIZE { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } match self.fs.getxattr( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(&name)?, size, ) { Ok(GetxattrReply::Value(val)) => reply_ok(None::, Some(&val), in_header.unique, w), Ok(GetxattrReply::Count(count)) => { let out = GetxattrOut { size: count, ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn listxattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let GetxattrIn { size, .. } = r.read_obj().map_err(Error::DecodeMessage)?; if size > MAX_BUFFER_SIZE { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } match self .fs .listxattr(Context::from(in_header), in_header.nodeid.into(), size) { Ok(ListxattrReply::Names(val)) => reply_ok(None::, Some(&val), in_header.unique, w), Ok(ListxattrReply::Count(count)) => { let out = GetxattrOut { size: count, ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn removexattr(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let namelen = (in_header.len as usize) .checked_sub(size_of::()) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0; namelen]; r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; let name = bytes_to_cstr(&buf)?; match self .fs .removexattr(Context::from(in_header), in_header.nodeid.into(), name) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn flush(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result 
{ let FlushIn { fh, lock_owner, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.flush( Context::from(in_header), in_header.nodeid.into(), fh.into(), lock_owner, ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn init(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let InitInCompat { major, minor, max_readahead, flags, } = r.read_obj().map_err(Error::DecodeMessage)?; let options = FsOptions::from_bits_truncate(flags as u64); let InitInExt { flags2, .. } = if options.contains(FsOptions::INIT_EXT) { r.read_obj().map_err(Error::DecodeMessage)? } else { InitInExt::default() }; if major < KERNEL_VERSION { error!("Unsupported fuse protocol version: {}.{}", major, minor); return reply_error( io::Error::from_raw_os_error(libc::EPROTO), in_header.unique, w, ); } if major > KERNEL_VERSION { // Wait for the kernel to reply back with a 7.X version. let out = InitOut { major: KERNEL_VERSION, minor: KERNEL_MINOR_VERSION, ..Default::default() }; return reply_ok(Some(out), None, in_header.unique, w); } if minor < MIN_KERNEL_MINOR_VERSION { error!( "Unsupported fuse protocol minor version: {}.{}", major, minor ); return reply_error( io::Error::from_raw_os_error(libc::EPROTO), in_header.unique, w, ); } // These fuse features are supported by this server by default. 
let supported = FsOptions::ASYNC_READ | FsOptions::PARALLEL_DIROPS | FsOptions::BIG_WRITES | FsOptions::AUTO_INVAL_DATA | FsOptions::ASYNC_DIO | FsOptions::HAS_IOCTL_DIR | FsOptions::ATOMIC_O_TRUNC | FsOptions::MAX_PAGES | FsOptions::SUBMOUNTS | FsOptions::INIT_EXT | FsOptions::CREATE_SUPP_GROUP; let flags_64 = ((flags2 as u64) << 32) | (flags as u64); let capable = FsOptions::from_bits_truncate(flags_64); let page_size: u32 = unsafe { libc::sysconf(libc::_SC_PAGESIZE).try_into().unwrap() }; let max_pages = ((MAX_BUFFER_SIZE - 1) / page_size) + 1; match self.fs.init(capable) { Ok(want) => { let enabled = (capable & (want | supported)).bits(); self.options.store(enabled, Ordering::Relaxed); let out = InitOut { major: KERNEL_VERSION, minor: KERNEL_MINOR_VERSION, max_readahead, flags: enabled as u32, max_background: u16::MAX, congestion_threshold: (u16::MAX / 4) * 3, max_write: MAX_BUFFER_SIZE, time_gran: 1, // nanoseconds max_pages: max_pages.try_into().unwrap(), map_alignment: 0, flags2: (enabled >> 32) as u32, ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn opendir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let OpenIn { flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self .fs .opendir(Context::from(in_header), in_header.nodeid.into(), flags) { Ok((handle, opts)) => { let out = OpenOut { fh: handle.map(Into::into).unwrap_or(0), open_flags: opts.bits(), ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn readdir(&self, in_header: InHeader, mut r: Reader, mut w: Writer) -> Result { let ReadIn { fh, offset, size, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; if size > MAX_BUFFER_SIZE { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } let available_bytes = w.available_bytes(); if available_bytes < size as usize { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } // Skip over enough bytes for the header. let unique = in_header.unique; let mut cursor = w.split_at(size_of::()).unwrap(); let result = match self.fs.readdir( Context::from(in_header), in_header.nodeid.into(), fh.into(), size, offset, ) { Ok(mut entries) => { let mut total_written = 0; let mut err = None; while let Some(dirent) = entries.next() { let remaining = (size as usize).saturating_sub(total_written); match add_dirent(&mut cursor, remaining, dirent, None) { // No more space left in the buffer. Ok(0) => break, Ok(bytes_written) => { total_written += bytes_written; } Err(e) => { err = Some(e); break; } } } if let Some(err) = err { Err(err) } else { Ok(total_written) } } Err(e) => Err(e), }; match result { Ok(total_written) => reply_readdir(total_written, unique, w), Err(e) => reply_error(e, unique, w), } } fn handle_dirent<'d>( &self, in_header: &InHeader, dir_entry: DirEntry<'d>, ) -> io::Result<(DirEntry<'d>, Entry)> { let parent = in_header.nodeid.into(); let name = dir_entry.name.to_bytes(); let entry = if name == CURRENT_DIR_CSTR || name == PARENT_DIR_CSTR { // Don't do lookups on the current directory or the parent directory. Safe because // this only contains integer fields and any value is valid. let mut attr = unsafe { MaybeUninit::::zeroed().assume_init() }; attr.st_ino = dir_entry.ino; attr.st_mode = dir_entry.type_ << 12; // We use 0 for the inode value to indicate a negative entry. Entry { inode: 0, generation: 0, attr, attr_flags: 0, attr_timeout: Duration::from_secs(0), entry_timeout: Duration::from_secs(0), } } else { self.fs .lookup(Context::from(*in_header), parent, dir_entry.name)? 
}; Ok((dir_entry, entry)) } fn readdirplus(&self, in_header: InHeader, mut r: Reader, mut w: Writer) -> Result { let ReadIn { fh, offset, size, .. } = r.read_obj().map_err(Error::DecodeMessage)?; if size > MAX_BUFFER_SIZE { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } let available_bytes = w.available_bytes(); if available_bytes < size as usize { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } // Skip over enough bytes for the header. let unique = in_header.unique; let mut cursor = w.split_at(size_of::()).unwrap(); let result = match self.fs.readdir( Context::from(in_header), in_header.nodeid.into(), fh.into(), size, offset, ) { Ok(mut entries) => { let mut total_written = 0; let mut err = None; while let Some(dirent) = entries.next() { let mut entry_inode = None; match self.handle_dirent(&in_header, dirent).and_then(|(d, e)| { entry_inode = Some(e.inode); let remaining = (size as usize).saturating_sub(total_written); add_dirent(&mut cursor, remaining, d, Some(e)) }) { Ok(0) => { // No more space left in the buffer but we need to undo the lookup // that created the Entry or we will end up with mismatched lookup // counts. if let Some(inode) = entry_inode { self.fs.forget(Context::from(in_header), inode.into(), 1); } break; } Ok(bytes_written) => { total_written += bytes_written; } Err(e) => { if let Some(inode) = entry_inode { self.fs.forget(Context::from(in_header), inode.into(), 1); } if total_written == 0 { // We haven't filled any entries yet so we can just propagate // the error. err = Some(e); } // We already filled in some entries. Returning an error now will // cause lookup count mismatches for those entries so just return // whatever we already have. 
break; } } } if let Some(err) = err { Err(err) } else { Ok(total_written) } } Err(e) => Err(e), }; match result { Ok(total_written) => reply_readdir(total_written, unique, w), Err(e) => reply_error(e, unique, w), } } fn releasedir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let ReleaseIn { fh, flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.releasedir( Context::from(in_header), in_header.nodeid.into(), flags, fh.into(), ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn fsyncdir(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let FsyncIn { fh, fsync_flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?; let datasync = fsync_flags & 0x1 != 0; match self.fs.fsyncdir( Context::from(in_header), in_header.nodeid.into(), datasync, fh.into(), ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn getlk(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.getlk() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn setlk(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.setlk() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn setlkw(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.setlkw() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn access(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let AccessIn { mask, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self .fs .access(Context::from(in_header), in_header.nodeid.into(), mask) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn create(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let CreateIn { flags, mode, umask, open_flags, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; let remaining_len = (in_header.len as usize) .checked_sub(size_of::()) .and_then(|l| l.checked_sub(size_of::())) .ok_or(Error::InvalidHeaderLength)?; let mut buf = vec![0; remaining_len]; r.read_exact(&mut buf).map_err(Error::DecodeMessage)?; let mut components = buf.split_inclusive(|c| *c == b'\0'); let name = components.next().ok_or(Error::MissingParameter)?; let options = FsOptions::from_bits_truncate(self.options.load(Ordering::Relaxed)); let extensions = get_extensions(options, name.len(), buf.as_slice())?; let kill_priv = open_flags & OPEN_KILL_SUIDGID != 0; match self.fs.create( Context::from(in_header), in_header.nodeid.into(), bytes_to_cstr(name)?, mode, kill_priv, flags, umask, extensions, ) { Ok((entry, handle, opts)) => { let entry_out = EntryOut { nodeid: entry.inode, generation: entry.generation, entry_valid: entry.entry_timeout.as_secs(), attr_valid: entry.attr_timeout.as_secs(), entry_valid_nsec: entry.entry_timeout.subsec_nanos(), attr_valid_nsec: entry.attr_timeout.subsec_nanos(), attr: Attr::with_flags(entry.attr, entry.attr_flags), }; let open_out = OpenOut { fh: handle.map(Into::into).unwrap_or(0), open_flags: opts.bits(), ..Default::default() }; // Kind of a hack to write both structs. reply_ok( Some(entry_out), Some(open_out.as_slice()), in_header.unique, w, ) } Err(e) => reply_error(e, in_header.unique, w), } } fn interrupt(&self, _in_header: InHeader) -> usize { 0 } fn bmap(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.bmap() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn destroy(&self) -> usize { // No reply to this function. 
self.fs.destroy(); self.options .store(FsOptions::empty().bits(), Ordering::Relaxed); 0 } fn ioctl(&self, in_header: InHeader, _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.ioctl() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn poll(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.poll() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn notify_reply(&self, in_header: InHeader, mut _r: Reader, w: Writer) -> Result { if let Err(e) = self.fs.notify_reply() { reply_error(e, in_header.unique, w) } else { Ok(0) } } fn batch_forget(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let BatchForgetIn { count, .. } = r.read_obj().map_err(Error::DecodeMessage)?; if let Some(size) = (count as usize).checked_mul(size_of::()) { if size > MAX_BUFFER_SIZE as usize { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), in_header.unique, w, ); } } else { return reply_error( io::Error::from_raw_os_error(libc::EOVERFLOW), in_header.unique, w, ); } let mut requests = Vec::with_capacity(count as usize); for _ in 0..count { requests.push( r.read_obj::() .map(|f| (f.nodeid.into(), f.nlookup)) .map_err(Error::DecodeMessage)?, ); } self.fs.batch_forget(Context::from(in_header), requests); // No reply for forget messages. Ok(0) } fn fallocate(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let FallocateIn { fh, offset, length, mode, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.fallocate( Context::from(in_header), in_header.nodeid.into(), fh.into(), mode, offset, length, ) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn lseek(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let LseekIn { fh, offset, whence, .. 
} = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.lseek( Context::from(in_header), in_header.nodeid.into(), fh.into(), offset, whence, ) { Ok(offset) => { let out = LseekOut { offset }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn copyfilerange(&self, in_header: InHeader, mut r: Reader, w: Writer) -> Result { let CopyfilerangeIn { fh_in, off_in, nodeid_out, fh_out, off_out, len, flags, .. } = r.read_obj().map_err(Error::DecodeMessage)?; match self.fs.copyfilerange( Context::from(in_header), in_header.nodeid.into(), fh_in.into(), off_in, nodeid_out.into(), fh_out.into(), off_out, len, flags, ) { Ok(count) => { let out = WriteOut { size: count as u32, ..Default::default() }; reply_ok(Some(out), None, in_header.unique, w) } Err(e) => reply_error(e, in_header.unique, w), } } fn syncfs(&self, in_header: InHeader, w: Writer) -> Result { match self .fs .syncfs(Context::from(in_header), in_header.nodeid.into()) { Ok(()) => reply_ok(None::, None, in_header.unique, w), Err(e) => reply_error(e, in_header.unique, w), } } fn tmpfile(&self, in_header: InHeader, _r: Reader, w: Writer) -> Result { let e = self .fs .tmpfile() .err() .unwrap_or_else(|| panic!("unsupported operation")); reply_error(e, in_header.unique, w) } } fn reply_readdir(len: usize, unique: u64, mut w: Writer) -> Result { let out = OutHeader { len: (size_of::() + len) as u32, error: 0, unique, }; debug!("Replying OK, header: {:?}", out); w.write_all(out.as_slice()).map_err(Error::EncodeMessage)?; w.flush().map_err(Error::FlushMessage)?; Ok(out.len as usize) } fn reply_ok( out: Option, data: Option<&[u8]>, unique: u64, mut w: Writer, ) -> Result { let mut len = size_of::(); if out.is_some() { len += size_of::(); } if let Some(data) = data { len += data.len(); } let header = OutHeader { len: len as u32, error: 0, unique, }; debug!("Replying OK, header: {:?}", header); w.write_all(header.as_slice()) .map_err(Error::EncodeMessage)?; if let 
Some(out) = out { w.write_all(out.as_slice()).map_err(Error::EncodeMessage)?; } if let Some(data) = data { w.write_all(data).map_err(Error::EncodeMessage)?; } debug_assert_eq!(len, w.bytes_written()); Ok(w.bytes_written()) } fn strerror(error: i32) -> String { let mut err_desc: Vec = vec![0; 256]; let buf_ptr = err_desc.as_mut_ptr() as *mut libc::c_char; // Safe because libc::strerror_r writes in err_desc at most err_desc.len() bytes unsafe { // We ignore the returned value since the two possible error values are: // EINVAL and ERANGE, in the former err_desc will be "Unknown error #" // and in the latter the message will be truncated to fit err_desc libc::strerror_r(error, buf_ptr, err_desc.len()); } let err_desc = err_desc.split(|c| *c == b'\0').next().unwrap(); String::from_utf8(err_desc.to_vec()).unwrap_or_else(|_| "".to_owned()) } fn reply_error(e: io::Error, unique: u64, mut w: Writer) -> Result { let header = OutHeader { len: size_of::() as u32, error: -e.raw_os_error().unwrap_or(libc::EIO), unique, }; debug!( "Replying ERROR, header: OutHeader {{ error: {} ({}), unique: {}, len: {} }}", header.error, strerror(-header.error), header.unique, header.len ); w.write_all(header.as_slice()) .map_err(Error::EncodeMessage)?; debug_assert_eq!(header.len as usize, w.bytes_written()); Ok(w.bytes_written()) } fn bytes_to_cstr(buf: &[u8]) -> Result<&CStr> { // Convert to a `CStr` first so that we can drop the '\0' byte at the end // and make sure there are no interior '\0' bytes. CStr::from_bytes_with_nul(buf).map_err(Error::InvalidCString) } fn add_dirent( cursor: &mut Writer, max: usize, d: DirEntry, entry: Option, ) -> io::Result { // Strip the trailing '\0'. let name = d.name.to_bytes(); if name.len() > u32::MAX as usize { return Err(io::Error::from_raw_os_error(libc::EOVERFLOW)); } let dirent_len = size_of::() .checked_add(name.len()) .ok_or_else(|| io::Error::from_raw_os_error(libc::EOVERFLOW))?; // Directory entries must be padded to 8-byte alignment. 
If adding 7 causes // an overflow then this dirent cannot be properly padded. let padded_dirent_len = dirent_len .checked_add(7) .map(|l| l & !7) .ok_or_else(|| io::Error::from_raw_os_error(libc::EOVERFLOW))?; let total_len = if entry.is_some() { padded_dirent_len .checked_add(size_of::()) .ok_or_else(|| io::Error::from_raw_os_error(libc::EOVERFLOW))? } else { padded_dirent_len }; if max < total_len { Ok(0) } else { if let Some(entry) = entry { cursor.write_all(EntryOut::from(entry).as_slice())?; } let dirent = Dirent { ino: d.ino, off: d.offset, namelen: name.len() as u32, type_: d.type_, }; cursor.write_all(dirent.as_slice())?; cursor.write_all(name)?; // We know that `dirent_len` <= `padded_dirent_len` due to the check above // so there's no need for checked arithmetic. let padding = padded_dirent_len - dirent_len; if padding > 0 { cursor.write_all(&DIRENT_PADDING[..padding])?; } Ok(total_len) } } fn take_object(data: &[u8]) -> Result<(T, &[u8])> { if data.len() < size_of::() { return Err(Error::DecodeMessage(einval())); } let (object_bytes, remaining_bytes) = data.split_at(size_of::()); // SAFETY: `T` implements `ByteValued` that guarantees that it is safe to instantiate // `T` with random data. let object: T = unsafe { std::ptr::read_unaligned(object_bytes.as_ptr() as *const T) }; Ok((object, remaining_bytes)) } fn parse_security_context(nr_secctx: u32, data: &[u8]) -> Result> { // Although the FUSE security context extension allows sending several security contexts, // currently the guest kernel only sends one. if nr_secctx > 1 { return Err(Error::DecodeMessage(einval())); } else if nr_secctx == 0 { // No security context sent. May be no LSM supports it. 
// NOTE(review): this chunk begins mid-function — the header of
// parse_security_context() is above this view. Also, tag-like generic
// parameters (e.g. `take_object::<SecctxHeader>`, `Result<Extensions>`,
// `size_of::<ExtHeader>()`) appear to have been stripped during text
// extraction; the empty `::()` / bare `Result` forms below are preserved
// byte-identically — restore them against the upstream source before use.
        return Ok(None);
    }

    // Header first, then the raw security-context payload.
    let (secctx, data) = take_object::(data)?;

    // An empty context with nr_secctx != 0 is malformed.
    if secctx.size == 0 {
        return Err(Error::DecodeMessage(einval()));
    }

    // The payload starts with a NUL-terminated xattr name; split it off
    // (split_inclusive keeps the trailing NUL in `secctx_name`).
    let mut components = data.split_inclusive(|c| *c == b'\0');
    let secctx_name = components.next().ok_or(Error::MissingParameter)?;
    let (_, data) = data.split_at(secctx_name.len());

    // The remaining bytes must hold at least the advertised context size.
    if data.len() < secctx.size as usize {
        return Err(Error::DecodeMessage(einval()));
    }

    // Fuse client aligns the whole security context block to 64 byte
    // boundary. So it is possible that after actual security context
    // of secctx.size, there are some null padding bytes left. If
    // we ever parse more data after secctx, we will have to take those
    // null bytes into account. Total size (including null bytes) is
    // available in SecctxHeader->size.
    let (remaining, _) = data.split_at(secctx.size as usize);
    let fuse_secctx = SecContext {
        name: CString::from_vec_with_nul(secctx_name.to_vec()).map_err(Error::InvalidCString2)?,
        secctx: remaining.to_vec(),
    };

    Ok(Some(fuse_secctx))
}

/// Parse a FUSE "supplementary groups" extension payload: a group header
/// followed by the group ID bytes. Returns the single GID sent by the guest.
fn parse_sup_groups(data: &[u8]) -> Result {
    let (group_header, group_id_bytes) = take_object::(data)?;

    // The FUSE extension allows sending several group IDs, but currently the guest
    // kernel only sends one.
    if group_header.nr_groups != 1 {
        return Err(Error::DecodeMessage(einval()));
    }

    let (gid, _) = take_object::(group_id_bytes)?;
    Ok(gid)
}

/// Parse the FUSE request extensions (security context and/or supplementary
/// groups) that follow the fixed-size part of a request.
///
/// `options` — negotiated FsOptions; extensions are only accepted when the
///             corresponding capability (SECURITY_CTX / CREATE_SUPP_GROUP)
///             was negotiated.
/// `skip` — byte offset in `request_bytes` where the extension area begins.
/// `request_bytes` — the full request payload.
///
/// Errors with DecodeMessage(EINVAL) on malformed, duplicate or
/// un-negotiated extensions, and MissingExtension when SECURITY_CTX was
/// negotiated but no SecCtx extension arrived.
fn get_extensions(options: FsOptions, skip: usize, request_bytes: &[u8]) -> Result {
    let mut extensions = Extensions::default();

    // Fast path: neither extension was negotiated, nothing to parse.
    if !(options.contains(FsOptions::SECURITY_CTX)
        || options.contains(FsOptions::CREATE_SUPP_GROUP))
    {
        return Ok(extensions);
    }

    // It's not guaranteed that we receive an extension even if it's
    // supported by the guest kernel.
    if request_bytes.len() < skip {
        return Err(Error::DecodeMessage(einval()));
    }

    // We need to track if a SecCtx was received, because it's valid
    // for the guest to send an empty SecCtx (i.e, nr_secctx == 0)
    let mut secctx_received = false;

    // Walk the extension area: each extension is a header (with total size
    // and type) followed by its payload.
    let mut buf = &request_bytes[skip..];
    while !buf.is_empty() {
        let (extension_header, remaining_bytes) = take_object::(buf)?;
        // header.size includes the header itself; guard against underflow
        // on a corrupt size field.
        let extension_size = (extension_header.size as usize)
            .checked_sub(size_of::())
            .ok_or(Error::InvalidHeaderLength)?;
        let (current_extension_bytes, next_extension_bytes) =
            remaining_bytes.split_at(extension_size);
        let ext_type = ExtType::try_from(extension_header.ext_type)
            .map_err(|_| Error::DecodeMessage(einval()))?;

        match ext_type {
            ExtType::SecCtx(nr_secctx) => {
                // Reject a SecCtx that wasn't negotiated, or a duplicate.
                if !options.contains(FsOptions::SECURITY_CTX) || secctx_received {
                    return Err(Error::DecodeMessage(einval()));
                }
                secctx_received = true;
                extensions.secctx = parse_security_context(nr_secctx, current_extension_bytes)?;
                debug!("Extension received: {} SecCtx", nr_secctx);
            }
            ExtType::SupGroups => {
                // Reject SupGroups that wasn't negotiated, or a duplicate.
                if !options.contains(FsOptions::CREATE_SUPP_GROUP)
                    || extensions.sup_gid.is_some()
                {
                    return Err(Error::DecodeMessage(einval()));
                }
                extensions.sup_gid = parse_sup_groups(current_extension_bytes)?.into();
                debug!("Extension received: SupGroups({:?})", extensions.sup_gid);
            }
        }

        // Let's process the next extension
        buf = next_extension_bytes;
    }

    // The SupGroup extension can be missing, since it is only sent if needed.
    // A SecCtx is always sent in create/symlink/mknod/mkdir if supported.
    if options.contains(FsOptions::SECURITY_CTX) && !secctx_received {
        return Err(Error::MissingExtension);
    }

    Ok(extensions)
}
virtiofsd-1.10.0/src/util.rs000064400000000000000000000133051046102023000140300ustar 00000000000000// Copyright 2022 Red Hat, Inc. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::fs::{File, OpenOptions};
use std::io::{Error, ErrorKind, Write};
use std::os::unix::fs::{MetadataExt, OpenOptionsExt};
use std::os::unix::io::{AsRawFd, FromRawFd};
use std::path::Path;
use std::result::Result;
use std::{fs, io, process};

/// Take an exclusive, non-blocking flock(2) lock on `file`.
/// Fails immediately (instead of blocking) if another process holds the lock.
fn try_lock_file(file: &File) -> Result<(), Error> {
    // Safe because 'file' must exist and we check the return value.
    let file_fd = file.as_raw_fd();
    let ret = unsafe { libc::flock(file_fd, libc::LOCK_EX | libc::LOCK_NB) };
    if ret == -1 {
        return Err(Error::last_os_error());
    }
    Ok(())
}

/// Create/lock a PID file at `pid_file_name` and write our PID into it.
///
/// Loops to close the classic open-then-lock race: after locking, it verifies
/// (by inode) that the path still refers to the file we locked; if the file
/// was removed or replaced in the meantime, it retries. Returns the open,
/// locked file so the lock lives as long as the returned handle.
pub fn write_pid_file(pid_file_name: &Path) -> std::result::Result {
    let mut pid_file = loop {
        let file = OpenOptions::new()
            .mode(libc::S_IRUSR | libc::S_IWUSR)
            .custom_flags(libc::O_CLOEXEC)
            .write(true)
            .create(true)
            .open(pid_file_name)?;

        try_lock_file(&file)?;

        // Let's make sure the file we locked still exists in the filesystem.
        let locked = file.metadata()?.ino();
        let current = match fs::metadata(pid_file_name) {
            Ok(stat) => stat.ino(),
            _ => continue, // the pid file got removed or some error happened, try again.
        };

        if locked == current {
            break file; // lock successfully acquired.
        }
        // the file changed, other process is racing with us, so try again.
    };

    let pid = format!("{}\n", process::id());
    pid_file.write_all(pid.as_bytes())?;

    Ok(pid_file)
}

/// Thin wrapper over the pidfd_open(2) syscall (no libc wrapper exists for it).
/// Returns a pid file descriptor for `pid`, or -1 on error (errno is set).
unsafe fn pidfd_open(pid: libc::pid_t, flags: libc::c_uint) -> libc::c_int {
    libc::syscall(libc::SYS_pidfd_open, pid, flags) as libc::c_int
}

/// Helper function to create a process and sets the parent process
/// death signal SIGTERM.
///
/// Returns the child's PID in the parent and 0 in the child (fork(2)
/// semantics). The child also checks, via a pidfd on the original parent,
/// that the parent did not die in the window before PR_SET_PDEATHSIG took
/// effect.
pub fn sfork() -> io::Result {
    let cur_pid = unsafe { libc::getpid() };

    // We use pidfd_open(2) to check the parent's pid because if the
    // child is created inside a pid namespace, getppid(2) will always
    // return 0
    let parent_pidfd = unsafe { pidfd_open(cur_pid, 0) };
    if parent_pidfd == -1 {
        return Err(Error::last_os_error());
    }

    // We wrap the parent PID file descriptor in a File object to ensure that is
    // auto-closed when it goes out of scope. But, since nothing can be read, using read(2),
    // from a PID file descriptor returned by pidfd_open(2) (it fails with EINVAL), we
    // use a new type PidFd to prevent using the File's methods directly, and in the hope
    // that whoever wants to do so will read this first.
    // This is a temporary solution until OwnedFd is stabilized.
    struct PidFd(File);
    let _pidfd = unsafe { PidFd(File::from_raw_fd(parent_pidfd)) };

    let child_pid = unsafe { libc::fork() };
    if child_pid == -1 {
        return Err(Error::last_os_error());
    }

    if child_pid == 0 {
        // Request to receive SIGTERM on parent's death.
        let ret = unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) };
        assert_eq!(ret, 0); // This shouldn't fail because libc::SIGTERM is a valid signal number

        // Check if the original parent died before libc::prctl() was called:
        // a pidfd becomes readable (POLLIN) once the process it refers to exits.
        let mut pollfds = libc::pollfd {
            fd: parent_pidfd,
            events: libc::POLLIN,
            revents: 0,
        };
        let num_fds = unsafe { libc::poll(&mut pollfds, 1, 0) };
        if num_fds == -1 {
            return Err(io::Error::last_os_error());
        }
        if num_fds != 0 {
            // The original parent died
            return Err(Error::new(
                ErrorKind::Other,
                "Parent process died unexpectedly",
            ));
        }
    }

    Ok(child_pid)
}

/// Block until child `pid` exits, then exit this process with the child's
/// exit code (or the negated signal number if it was killed by a signal).
/// Never returns (`-> !`); drops all capabilities first, since the waiting
/// parent needs none.
pub fn wait_for_child(pid: i32) -> ! {
    // Drop all capabilities, since the parent doesn't require any
    // capabilities, as it'd be just waiting for the child to exit.
    capng::clear(capng::Set::BOTH);
    if let Err(e) = capng::apply(capng::Set::BOTH) {
        // Don't exit the process here since we already have a child.
        error!("warning: can't apply the parent capabilities: {}", e);
    }

    let mut status = 0;
    // On success, `libc::waitpid()` returns the PID of the child.
    if unsafe { libc::waitpid(pid, &mut status, 0) } != pid {
        error!("Error during waitpid()");
        process::exit(1);
    }

    let exit_code = if libc::WIFEXITED(status) {
        libc::WEXITSTATUS(status)
    } else if libc::WIFSIGNALED(status) {
        let signal = libc::WTERMSIG(status);
        error!("Child process terminated by signal {}", signal);
        -signal
    } else {
        error!("Unexpected waitpid status: {:#X}", status);
        libc::EXIT_FAILURE
    };

    process::exit(exit_code);
}

/// Add a capability to the effective set
/// # Errors
/// An error variant will be returned:
/// - if the input string does not match the name, without the 'CAP_' prefix,
///   of any of the capability defined in `linux/capabiliy.h`.
/// - if `capng::get_caps_process()` cannot get the capabilities and bounding set of the process.
/// - if `capng::update()` fails to update the internal posix capabilities settings.
/// - if `capng::apply()` fails to transfer the specified internal posix capabilities
///   settings to the kernel.
pub fn add_cap_to_eff(cap_name: &str) -> capng::Result<()> {
    use capng::{Action, CUpdate, Set, Type};

    let cap = capng::name_to_capability(cap_name)?;
    capng::get_caps_process()?;

    let req = vec![CUpdate {
        action: Action::ADD,
        cap_type: Type::EFFECTIVE,
        capability: cap,
    }];

    capng::update(req)?;
    capng::apply(Set::CAPS)?;
    Ok(())
}